!--------------------------------------------------------------------------------------------------!
! Copyright (C) by the DBCSR developers group - All rights reserved                                !
! Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved                            !
! This file is part of the DBCSR library.                                                          !
!                                                                                                  !
! For information on the license, see the LICENSE file.                                            !
! For further information please visit https://dbcsr.cp2k.org                                      !
! SPDX-License-Identifier: GPL-2.0+                                                                !
!--------------------------------------------------------------------------------------------------!

MODULE dbcsr_mm_cannon
   !! First layer of the dbcsr matrix-matrix multiplication.
   !! It performs the MPI parallelization according to Cannon's algorithm.
   !! <b>Modification history:</b>
   !! - 2010-02-23 Moved from dbcsr_operations
   !! - 2011-11    Moved parameter-stack processing routines to
   !! dbcsr_mm_methods.
   !! - 2013-01    reorganized code (Ole Schuett)

   USE dbcsr_acc_event, ONLY: acc_event_synchronize
   USE dbcsr_acc_device, ONLY: acc_device_synchronize
   USE dbcsr_acc_stream, ONLY: acc_stream_synchronize
   USE dbcsr_acc_devmem, ONLY: acc_devmem_cptr
   USE dbcsr_array_types, ONLY: array_data, &
                                array_exists, &
                                array_i1d_obj, &
                                array_size
   USE dbcsr_block_operations, ONLY: dbcsr_block_conjg, &
                                     dbcsr_block_copy_aa, &
                                     dbcsr_block_real_neg, &
                                     dbcsr_block_scale, &
                                     dbcsr_block_transpose_aa, &
                                     dbcsr_block_transpose
   USE dbcsr_config, ONLY: dbcsr_cfg, &
                           use_acc
   USE dbcsr_data_methods, ONLY: &
      dbcsr_data_clear_pointer, dbcsr_data_ensure_size, dbcsr_data_get_size, &
      dbcsr_data_get_size_referenced, dbcsr_data_hold, dbcsr_data_host2dev, dbcsr_data_init, &
      dbcsr_data_new, dbcsr_data_release, dbcsr_data_set_pointer, &
      dbcsr_data_set_size_referenced, dbcsr_scalar_are_equal, dbcsr_scalar_negative, &
      dbcsr_scalar_one, dbcsr_get_data_p_s, dbcsr_get_data_p_d, dbcsr_get_data_p_c, dbcsr_get_data_p_z
   USE dbcsr_data_types, ONLY: dbcsr_datatype_sizeof
   USE dbcsr_dist_methods, ONLY: dbcsr_distribution_col_dist, &
                                 dbcsr_distribution_has_threads, &
                                 dbcsr_distribution_max_col_dist, &
                                 dbcsr_distribution_max_row_dist, &
                                 dbcsr_distribution_mp, &
                                 dbcsr_distribution_row_dist
   USE dbcsr_index_operations, ONLY: dbcsr_count_row_index, &
                                     dbcsr_has_local_row_index, &
                                     dbcsr_index_prune_deleted, &
                                     dbcsr_make_index_list, &
                                     dbcsr_make_index_local_row, &
                                     dbcsr_repoint_index
   USE dbcsr_io, ONLY: dbcsr_print
   USE dbcsr_iterator_operations, ONLY: dbcsr_iterator_blocks_left, &
                                        dbcsr_iterator_next_block, &
                                        dbcsr_iterator_start, &
                                        dbcsr_iterator_stop
   USE dbcsr_mem_methods, ONLY: dbcsr_mempool_limit_capacity
   USE dbcsr_methods, ONLY: &
      dbcsr_destroy_array, dbcsr_distribution, dbcsr_get_data_type, dbcsr_get_index_memory_type, &
      dbcsr_has_symmetry, dbcsr_image_dist_hold, dbcsr_image_dist_init, &
      dbcsr_image_dist_release, dbcsr_max_col_size, dbcsr_max_row_size, dbcsr_nblkcols_local, &
      dbcsr_nblkrows_local, dbcsr_nblkrows_total, dbcsr_release, dbcsr_valid_index
   USE dbcsr_mm_common, ONLY: &
      acc_transpose_blocks, calculate_norms, count_mpi_statistics, dbcsr_mm_multrec_type_p, &
      dbcsr_mpi_statistics, enumerate_blk_sizes, huge_norm, local_filter, max_memory, &
      memtype_abpanel_1, memtype_abpanel_2, memtype_mpi_buffer, memtype_trsbuffer_1, &
      memtype_normsbuf, memtype_offsetsbuf, memtype_nelemsbuf, acc_calculate_norms, &
      memtype_trsbuffer_2, product_matrix_size_guess, rec_sort_index, setup_buffer_matrix
   USE dbcsr_mm_dist_operations, ONLY: dbcsr_get_local_vcols, &
                                       dbcsr_get_local_vrows, &
                                       dbcsr_reset_vlocals, &
                                       image_calculator
   USE dbcsr_mm_multrec, ONLY: dbcsr_mm_multrec_finalize, &
                               dbcsr_mm_multrec_init, &
                               dbcsr_mm_multrec_multiply
   USE dbcsr_mp_methods, ONLY: &
      dbcsr_mp_grid_setup, dbcsr_mp_group, dbcsr_mp_has_subgroups, dbcsr_mp_my_col_group, &
      dbcsr_mp_my_row_group, dbcsr_mp_mynode, dbcsr_mp_mypcol, dbcsr_mp_myprow, dbcsr_mp_npcols, &
      dbcsr_mp_nprows, dbcsr_mp_numnodes, dbcsr_mp_pgrid
   USE dbcsr_mp_operations, ONLY: dbcsr_irecv_any, &
                                  dbcsr_isend_any, &
                                  hybrid_alltoall_any, &
                                  hybrid_alltoall_i1
   USE dbcsr_operations, ONLY: dbcsr_copy, &
                               dbcsr_crop_matrix
   USE dbcsr_ptr_util, ONLY: ensure_array_size
   USE dbcsr_transformations, ONLY: dbcsr_make_dense_low
   USE dbcsr_types, ONLY: &
      dbcsr_2d_array_type, dbcsr_data_obj, dbcsr_distribution_obj, dbcsr_imagedistribution_obj, &
      dbcsr_iterator, dbcsr_memtype_type, dbcsr_meta_size, dbcsr_mp_obj, dbcsr_scalar_type, &
      dbcsr_slot_home_coli, dbcsr_slot_home_pcol, dbcsr_slot_home_prow, dbcsr_slot_home_rowi, &
      dbcsr_slot_home_vpcol, dbcsr_slot_home_vprow, dbcsr_slot_size, dbcsr_type, &
      dbcsr_type_complex_4, dbcsr_type_complex_8, dbcsr_type_int_4, dbcsr_type_no_symmetry, &
      dbcsr_type_real_4, dbcsr_type_real_8
   USE dbcsr_dist_util, ONLY: count_bins, &
                              dbcsr_checksum
   USE dbcsr_work_operations, ONLY: dbcsr_create, &
                                    dbcsr_special_finalize, &
                                    dbcsr_work_create
   USE dbcsr_kinds, ONLY: dp, &
                          int_8, &
                          real_4, &
                          real_8, &
                          sp
   USE dbcsr_machine, ONLY: default_output_unit, &
                            m_memory
   USE dbcsr_mpiwrap, ONLY: mp_allgather, &
                            mp_alltoall, &
                            mp_environ, &
                            mp_irecv, &
                            mp_isend, &
                            mp_request_null, &
                            mp_sum, &
                            mp_testany, &
                            mp_waitall, &
                            mp_comm_type, &
                            mp_request_type

   USE ISO_C_BINDING, ONLY: C_F_POINTER

#include "base/dbcsr_base_uses.f90"

!$ USE OMP_LIB, ONLY: omp_get_thread_num, omp_get_num_threads

   IMPLICIT NONE

   PRIVATE

   CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_mm_cannon'
   LOGICAL, PARAMETER :: debug_mod = .FALSE.
   LOGICAL, PARAMETER :: careful_mod = .FALSE.

   INTERFACE dbcsr_switch
      MODULE PROCEDURE dbcsr_switch_sets
      MODULE PROCEDURE dbcsr_switch_d_ptrs
   END INTERFACE

   PUBLIC :: multiply_cannon, make_m2s, &
             multiply_cannon_g2g

CONTAINS

   SUBROUTINE make_m2s(matrix, m2s, rdist, dense_rdist, &
      !! Make images from the matrix (left or right)
                       use_dense_mult, ab_dense, predistribute, &
                       f_k, l_k, f_row, l_row, f_col, l_col, &
                       row_blk_size, col_blk_size, &
                       k_vmap, m_map, n_map, &
                       alpha)
      TYPE(dbcsr_type), INTENT(IN)                       :: matrix
      TYPE(dbcsr_2d_array_type), INTENT(OUT), POINTER    :: m2s
      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: rdist, dense_rdist
      LOGICAL, INTENT(IN)                                :: use_dense_mult, ab_dense
      CHARACTER, INTENT(IN)                              :: predistribute
      INTEGER, INTENT(IN)                                :: f_k, l_k, f_row, l_row, f_col, l_col
      TYPE(array_i1d_obj), INTENT(INOUT)                 :: row_blk_size, col_blk_size
      TYPE(array_i1d_obj), INTENT(IN)                    :: k_vmap, m_map, n_map
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: alpha

      CHARACTER(len=*), PARAMETER :: routineN = 'make_m2s'
      INTEGER                                            :: handle, i, im, j, jm
      INTEGER, DIMENSION(4)                              :: f_crop
      LOGICAL                                            :: do_scale, thread_redist
      TYPE(array_i1d_obj)                                :: col_map, row_map
      TYPE(dbcsr_type)                                   :: dense_template, matrix_tmp

      CALL timeset(routineN, handle)
      ALLOCATE (m2s)
      do_scale = .FALSE.
      IF (PRESENT(alpha)) THEN
         IF (.NOT. dbcsr_scalar_are_equal(alpha, dbcsr_scalar_one(alpha%data_type))) THEN
            do_scale = .TRUE.
         END IF
      END IF

      IF (do_scale) THEN
         ! Copy and scale matrix if alpha is not 1.
         CALL dbcsr_make_images(matrix, m2s, rdist, &
                                predistribute=predistribute, &
                                no_copy_data=use_dense_mult, scale_value=alpha)
      ELSE
         CALL dbcsr_make_images(matrix, m2s, rdist, &
                                predistribute=predistribute, &
                                no_copy_data=use_dense_mult)
      END IF

      im = SIZE(m2s%mats, 1)
      jm = SIZE(m2s%mats, 2)
      SELECT CASE (predistribute)
      CASE ('L')
         f_crop = (/f_row, l_row, f_k, l_k/)
         row_map = m_map
         col_map = k_vmap
         thread_redist = .TRUE.
      CASE default
         f_crop = (/f_k, l_k, f_col, l_col/)
         row_map = k_vmap
         col_map = n_map
         thread_redist = .FALSE.
      END SELECT

      ! Post-processing of images.
      DO i = 1, im
         DO j = 1, jm
            CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist)
            ! Crop if necessary
            IF (ANY(f_crop .NE. 0)) THEN
               matrix_tmp = dbcsr_type()
               CALL dbcsr_crop_matrix(matrix_tmp, m2s%mats(i, j), &
                                      full_row_bounds=f_crop(1:2), &
                                      full_column_bounds=f_crop(3:4), &
                                      shallow_data=.FALSE.)
               CALL dbcsr_release(m2s%mats(i, j))
               CALL dbcsr_copy(m2s%mats(i, j), matrix_tmp, shallow_data=.TRUE.)
               CALL dbcsr_release(matrix_tmp)
               CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist)
            END IF
         END DO
      END DO

      IF (ab_dense) THEN
         dense_template = dbcsr_type()
         CALL dbcsr_create(dense_template, template=matrix, &
                           dist=dense_rdist%i%main, &
                           row_blk_size_obj=row_blk_size, col_blk_size_obj=col_blk_size)
         CALL dbcsr_make_images_dense(m2s, dense_rdist, &
                                      row_map=row_map, col_map=col_map, &
                                      join_cols=use_dense_mult, join_rows=ab_dense, &
                                      new_template=dense_template)
         CALL dbcsr_image_dist_release(rdist)
         rdist = dense_rdist
         CALL dbcsr_image_dist_hold(rdist)
         DO i = 1, im
            DO j = 1, jm
               CALL dbcsr_reset_vlocals(m2s%mats(i, j), rdist)
            END DO
         END DO
      END IF

      DO i = 1, im
         DO j = 1, jm
            ! Convert to local-row index
            CALL dbcsr_make_index_local_row(m2s%mats(i, j))
            ! Convert to list index
            CALL dbcsr_make_index_list(m2s%mats(i, j), thread_redist=thread_redist)
         END DO
      END DO

      IF (ab_dense) THEN
         CALL dbcsr_image_dist_release(dense_rdist)
         CALL dbcsr_release(dense_template)
      END IF

      CALL timestop(handle)
   END SUBROUTINE make_m2s

   SUBROUTINE dbcsr_make_images(source, normalized, target_image_dist, &
                                predistribute, no_copy_data, scale_value)
      !! Creates row and column images of a matrix.

      TYPE(dbcsr_type), INTENT(IN)                       :: source
         !! input matrix
      TYPE(dbcsr_2d_array_type), INTENT(OUT)             :: normalized
         !! image array of the normalized matrix
      TYPE(dbcsr_imagedistribution_obj), INTENT(IN)      :: target_image_dist
         !! normalize to this image distribution
      CHARACTER, INTENT(IN), OPTIONAL                    :: predistribute
         !! predistribute data for multiplication
      LOGICAL, INTENT(IN), OPTIONAL                      :: no_copy_data
         !! try to not merge data at the end
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: scale_value
         !! scale with this value

!   ---------------------------------------------------------------------------

      IF (dbcsr_cfg%use_mpi_rma%val) &
         DBCSR_ABORT("RMA algo not supported here!")
      IF (.NOT. dbcsr_valid_index(source)) &
         DBCSR_ABORT("Matrix not initialized.")
      CALL make_images(source, normalized, &
                       target_image_dist, desymmetrize=dbcsr_has_symmetry(source), &
                       predistribute=predistribute, &
                       no_copy_data=no_copy_data, &
                       scale_value=scale_value)
      normalized%image_dist = target_image_dist
      CALL dbcsr_image_dist_hold(normalized%image_dist)
   END SUBROUTINE dbcsr_make_images

   SUBROUTINE make_images(ism, ums, target_imgdist, desymmetrize, predistribute, &
                          no_copy_data, scale_value)
      !! Makes column-based and row-based images of a matrix.

      TYPE(dbcsr_type), INTENT(IN)                       :: ism
         !! input symmetric matrix
      TYPE(dbcsr_2d_array_type), INTENT(OUT)             :: ums
         !! normalized matrices
      TYPE(dbcsr_imagedistribution_obj), INTENT(IN)      :: target_imgdist
         !! image distribution to normalize to
      LOGICAL, INTENT(IN), OPTIONAL                      :: desymmetrize
         !! desymmetrize a symmetric matrix
      CHARACTER, INTENT(IN), OPTIONAL                    :: predistribute
         !! predistribute data for multiplication
      LOGICAL, INTENT(IN), OPTIONAL                      :: no_copy_data
         !! try to not merge data at the end
      TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL      :: scale_value
         !! scale with this value

      CHARACTER(len=*), PARAMETER :: routineN = 'make_images'
      INTEGER, PARAMETER                                 :: metalen = 5
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      CHARACTER                                          :: predist_type, predist_type_fwd
      INTEGER :: blk, blk_l, blk_p, bp, col, col_img, col_size, coli, data_type, dst_p, handle, &
                 handle2, ithread, ncol_images, nrow_images, nsymmetries, nthreads, numproc, &
                 nze, pcol, prow, row, row_img, row_size, rowi, sd_pos, sm_pos, src_p, stored_blk_p, &
                 stored_col, stored_row, symmetry_i, tr_col_size, tr_row_size, vcol, vrow
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: myt_sdp, myt_smp, rd_disp, recv_meta, &
                                                            rm_disp, sd_disp, sdp, send_meta, &
                                                            sm_disp, smp
      INTEGER, ALLOCATABLE, DIMENSION(:, :)              :: all_total_send_offset, blk_ps, blks, &
                                                            myt_total_send_count, &
                                                            total_recv_count, total_send_count
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: myt_send_count, recv_count, send_count
      INTEGER, DIMENSION(:), CONTIGUOUS, POINTER         :: col_blk_size, col_dist, col_img_dist, &
                                                            row_blk_size, row_dist, row_img_dist
      INTEGER, DIMENSION(:, :), CONTIGUOUS, POINTER      :: blacs2mpi
      LOGICAL                                            :: nocopy, tr
      TYPE(dbcsr_data_obj)                               :: received_data_area, recv_data_area, &
                                                            send_data_area
      TYPE(dbcsr_distribution_obj)                       :: old_dist, target_dist
      TYPE(dbcsr_iterator)                               :: iter
      TYPE(dbcsr_memtype_type)                           :: data_memory_type
      TYPE(dbcsr_mp_obj)                                 :: mp_obj
      TYPE(dbcsr_scalar_type)                            :: scale_neg_one
      TYPE(dbcsr_type)                                   :: sm
      TYPE(mp_comm_type)                                 :: mp_group

!   ---------------------------------------------------------------------------
! Check input matrix
! Set convenient variables to access input matrix info
!

      CALL timeset(routineN, handle)
      nocopy = .FALSE.
      IF (PRESENT(no_copy_data)) nocopy = no_copy_data
      sm = ism
      nsymmetries = 1
      IF (PRESENT(desymmetrize)) THEN
         IF (desymmetrize .AND. sm%symmetry) THEN
            nsymmetries = 2
         END IF
      END IF
      SELECT CASE (predistribute)
      CASE ('L', 'l')
         predist_type = 'L'
         predist_type_fwd = 'l'
      CASE ('R', 'r')
         predist_type = 'R'
         predist_type_fwd = 'r'
      CASE default
         DBCSR_ABORT("Incorrect pre-shift specifier.")
      END SELECT
      data_type = sm%data_type
      IF (data_type .NE. dbcsr_type_real_8 .AND. &
          data_type .NE. dbcsr_type_real_4 .AND. &
          data_type .NE. dbcsr_type_complex_8 .AND. &
          data_type .NE. dbcsr_type_complex_4) &
         DBCSR_ABORT("Invalid data type.")
      scale_neg_one = dbcsr_scalar_negative(dbcsr_scalar_one(data_type))
      row_blk_size => array_data(sm%row_blk_size)
      col_blk_size => array_data(sm%col_blk_size)
      old_dist = dbcsr_distribution(ism)
      target_dist = target_imgdist%i%main
      row_dist => dbcsr_distribution_row_dist(target_dist)
      col_dist => dbcsr_distribution_col_dist(target_dist)
      IF (sm%symmetry) THEN
         IF (SIZE(row_dist) .NE. SIZE(col_dist)) &
            DBCSR_WARN('Unequal row and column distributions for symmetric matrix.')
      END IF
      nrow_images = target_imgdist%i%row_decimation
      IF (nrow_images .GT. 1) THEN
         row_img_dist => array_data(target_imgdist%i%row_image)
      ELSE
         NULLIFY (row_img_dist)
      END IF
      ncol_images = target_imgdist%i%col_decimation
      IF (ncol_images .GT. 1) THEN
         col_img_dist => array_data(target_imgdist%i%col_image)
      ELSE
         NULLIFY (col_img_dist)
      END IF
      mp_obj = dbcsr_distribution_mp(target_dist)
      blacs2mpi => dbcsr_mp_pgrid(mp_obj)
      numproc = dbcsr_mp_numnodes(mp_obj)
      mp_group = dbcsr_mp_group(mp_obj)
      IF (dbcsr_distribution_max_row_dist(old_dist) .GT. UBOUND(blacs2mpi, 1)) &
         DBCSR_ABORT('Row distribution references unexistent processor rows')
      IF (dbg) THEN
         IF (dbcsr_distribution_max_row_dist(old_dist) .NE. UBOUND(blacs2mpi, 1)) &
            DBCSR_WARN('Range of row distribution not equal to processor rows')
      END IF
      IF (dbcsr_distribution_max_col_dist(old_dist) .GT. UBOUND(blacs2mpi, 2)) &
         DBCSR_ABORT('Col distribution references unexistent processor cols')
      IF (dbg) THEN
         IF (dbcsr_distribution_max_col_dist(old_dist) .NE. UBOUND(blacs2mpi, 2)) &
            DBCSR_WARN('Range of col distribution not equal to processor cols')
      END IF

      ! Check threads configuration
!$    IF (.NOT. dbcsr_distribution_has_threads(old_dist)) &
!$       DBCSR_ABORT("Thread distribution not defined")

      ! Allocate shared temporary buffers
      !
      ALLOCATE (send_count(2, nrow_images, ncol_images, 0:numproc - 1)); send_count = 0
      ALLOCATE (recv_count(2, nrow_images, ncol_images, 0:numproc - 1))
      ALLOCATE (total_send_count(2, 0:numproc - 1)); total_send_count = 0
      ALLOCATE (total_recv_count(2, 0:numproc - 1))
      ALLOCATE (sdp(0:numproc - 1))
      ALLOCATE (smp(0:numproc - 1))
      ALLOCATE (sd_disp(0:numproc - 1)); sd_disp(0) = 1
      ALLOCATE (sm_disp(0:numproc - 1)); sm_disp(0) = 1
      ALLOCATE (rd_disp(0:numproc - 1)); rd_disp(0) = 1
      ALLOCATE (rm_disp(0:numproc - 1)); rm_disp(0) = 1
      ALLOCATE (all_total_send_offset(2, 0:numproc - 1))
      ALLOCATE (blk_ps(nrow_images, ncol_images)); blk_ps = 1
      ALLOCATE (blks(nrow_images, ncol_images)); blks = 1
      !
      ! Allocate and init mats
      !
      ALLOCATE (ums%mats(nrow_images, ncol_images))
      data_memory_type = memtype_abpanel_1
      DO row_img = 1, nrow_images
         DO col_img = 1, ncol_images
            ums%mats(row_img, col_img) = dbcsr_type()
            CALL dbcsr_create(ums%mats(row_img, col_img), "imaged "//sm%name, &
                              target_dist, &
                              dbcsr_type_no_symmetry, &
                              row_blk_size_obj=sm%row_blk_size, col_blk_size_obj=sm%col_blk_size, &
                              nze=0, data_type=data_type, &
                              max_rbs=sm%max_rbs, max_cbs=sm%max_cbs, &
                              row_blk_offset=sm%row_blk_offset, col_blk_offset=sm%col_blk_offset, &
                              thread_dist=sm%dist, &
                              data_memory_type=data_memory_type, &
                              index_memory_type=memtype_mpi_buffer)
            ums%mats(row_img, col_img)%negate_real = sm%negate_real
            ums%mats(row_img, col_img)%negate_imaginary = sm%negate_imaginary
         END DO
      END DO

      nthreads = 1
!$OMP PARALLEL DEFAULT (NONE) &
!$OMP PRIVATE (ithread, symmetry_i, row_img, col_img, &
!$OMP          myt_send_count, myt_total_send_count, &
!$OMP          iter, row, col, blk, row_size, col_size, stored_row, stored_col, &
!$OMP          prow, pcol, rowi, coli, vrow, vcol, dst_p, nze, myt_smp, myt_sdp, &
!$OMP          blk_p, bp, sd_pos, sm_pos,tr, &
!$OMP          tr_row_size, tr_col_size) &
!$OMP SHARED (nthreads, nsymmetries, row_img_dist, col_img_dist, &
!$OMP         nrow_images, ncol_images, numproc, scale_value, &
!$OMP         ums, sm, ism, target_imgdist, row_dist, col_dist,&
!$OMP         predist_type_fwd, blacs2mpi, row_blk_size, col_blk_size, &
!$OMP         send_count, recv_count, handle2,mp_group, &
!$OMP         total_send_count, total_recv_count, recv_data_area, nocopy, &
!$OMP         data_type, recv_meta, send_data_area, send_meta, &
!$OMP         sd_disp, sm_disp, rd_disp, rm_disp, all_total_send_offset, blk_ps, blks, &
!$OMP         received_data_area, scale_neg_one, memtype_abpanel_1)
      ithread = 0
!$    ithread = omp_get_thread_num()
!$OMP MASTER
!$    nthreads = omp_get_num_threads()
!$OMP END MASTER

      ! Allocate thread private data
      !
      ALLOCATE (myt_send_count(2, nrow_images, ncol_images, 0:numproc - 1)); myt_send_count(:, :, :, :) = 0
      ALLOCATE (myt_total_send_count(2, 0:numproc - 1))
      ! Thread-local pointers of the current adding position into the send buffers
      ALLOCATE (myt_smp(0:numproc - 1), myt_sdp(0:numproc - 1))

      ! Count sizes for sending
      !
      CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.)
      DO WHILE (dbcsr_iterator_blocks_left(iter))
         CALL dbcsr_iterator_next_block(iter, row, col, blk, &
                                        row_size=row_size, col_size=col_size)
         nze = row_size*col_size
         IF (nze .EQ. 0) CYCLE
         DO symmetry_i = 1, nsymmetries
            IF (symmetry_i .EQ. 1) THEN
               stored_row = row; stored_col = col
            ELSE
               IF (row .EQ. col) CYCLE
               stored_row = col; stored_col = row
            END IF
            ! Where do we send this block?
            row_img = 1
            IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
            col_img = 1
            IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
            CALL image_calculator(target_imgdist, &
                                  prow=prow, rowi=rowi, &
                                  pcol=pcol, coli=coli, &
                                  vprow=vrow, vpcol=vcol, &
                                  myprow=row_dist(stored_row), myrowi=row_img, &
                                  mypcol=col_dist(stored_col), mycoli=col_img, &
                                  shifting=predist_type_fwd)
            dst_p = blacs2mpi(prow, pcol)
            ! These counts are meant for the thread that processes this row.
            myt_send_count(1, rowi, coli, dst_p) = &
               myt_send_count(1, rowi, coli, dst_p) + 1
            myt_send_count(2, rowi, coli, dst_p) = &
               myt_send_count(2, rowi, coli, dst_p) + nze
         END DO ! symmetry_i
      END DO
      CALL dbcsr_iterator_stop(iter)
      DO dst_p = 0, numproc - 1
         myt_total_send_count(1, dst_p) = SUM(myt_send_count(1, :, :, dst_p))
         myt_total_send_count(2, dst_p) = SUM(myt_send_count(2, :, :, dst_p))
      END DO
      ! Merge the send counts
!$OMP CRITICAL
      send_count(:, :, :, :) = send_count(:, :, :, :) + myt_send_count(:, :, :, :)
      total_send_count(:, :) = total_send_count(:, :) + myt_total_send_count(:, :)
!$OMP END CRITICAL
!$OMP BARRIER
!$OMP MASTER
      CALL timeset(routineN//"_sizes", handle2)
      CALL mp_alltoall(send_count, recv_count, 2*nrow_images*ncol_images, &
                       mp_group)
      CALL timestop(handle2)
!$OMP END MASTER
!$OMP BARRIER
      ! Fill in the meta data structures and copy the data.
!$OMP DO
      DO dst_p = 0, numproc - 1
         total_recv_count(1, dst_p) = SUM(recv_count(1, :, :, dst_p))
         total_recv_count(2, dst_p) = SUM(recv_count(2, :, :, dst_p))
      END DO
!$OMP MASTER
      ! Allocate data structures needed for data exchange.
      CALL dbcsr_data_init(recv_data_area)
      CALL dbcsr_data_init(send_data_area)
      IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN
         ! For some cases the faster dbcsr_special_finalize(reshuffle=.FALSE.) can be used.
         ! This basically makes this working matrix the actual data-area.
         ! Hence, for those cases we have to use data_memory_type already here.
         CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)), memory_type=memtype_abpanel_1)
         ! Some MPI implementations have high overhead when encountering a new buffer.
         ! Therefore we also use the memory pool for the send buffer.
         CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)), memory_type=memtype_abpanel_1)
      ELSE
         CALL dbcsr_data_new(recv_data_area, data_type, SUM(total_recv_count(2, :)))
         CALL dbcsr_data_new(send_data_area, data_type, SUM(total_send_count(2, :)))
      END IF
      ALLOCATE (recv_meta(metalen*SUM(total_recv_count(1, :))))
      ALLOCATE (send_meta(metalen*SUM(total_send_count(1, :))))
      ! Calculate displacements for processors needed for the exchanges.
      DO dst_p = 1, numproc - 1
         sm_disp(dst_p) = sm_disp(dst_p - 1) &
                          + metalen*total_send_count(1, dst_p - 1)
         sd_disp(dst_p) = sd_disp(dst_p - 1) &
                          + total_send_count(2, dst_p - 1)
         rm_disp(dst_p) = rm_disp(dst_p - 1) &
                          + metalen*total_recv_count(1, dst_p - 1)
         rd_disp(dst_p) = rd_disp(dst_p - 1) &
                          + total_recv_count(2, dst_p - 1)
      END DO
      myt_smp(:) = sm_disp(:)
      myt_sdp(:) = sd_disp(:)
      IF (nthreads .GT. 1) THEN
         all_total_send_offset(1, :) = myt_smp(:) + metalen*myt_total_send_count(1, :)
         all_total_send_offset(2, :) = myt_sdp(:) + myt_total_send_count(2, :)
      END IF
!$OMP END MASTER
!$OMP BARRIER
      IF (ithread .GT. 0) THEN
!$OMP CRITICAL
         myt_smp(:) = all_total_send_offset(1, :)
         myt_sdp(:) = all_total_send_offset(2, :)
         all_total_send_offset(1, :) &
            = all_total_send_offset(1, :) + metalen*myt_total_send_count(1, :)
         all_total_send_offset(2, :) &
            = all_total_send_offset(2, :) + myt_total_send_count(2, :)
!$OMP END CRITICAL
      ELSE
         CALL dbcsr_data_init(received_data_area)
         received_data_area = recv_data_area
         CALL dbcsr_data_hold(received_data_area)
         DO row_img = 1, nrow_images
            DO col_img = 1, ncol_images
               CALL dbcsr_work_create(ums%mats(row_img, col_img), &
                                      SUM(recv_count(1, row_img, col_img, :)), n=1)
               CALL dbcsr_data_hold(received_data_area)
               CALL dbcsr_data_release(ums%mats(row_img, col_img)%wms(1)%data_area)
               ums%mats(row_img, col_img)%wms(1)%data_area = received_data_area
            END DO
         END DO
      END IF
!$OMP BARRIER
      ! Add timing call to the packing of the send buffers
      !
      CALL timeset(routineN//"_pack", handle2)
      ! Copies metadata and actual data to be sent into the send buffers.
      CALL dbcsr_iterator_start(iter, ism, shared=.TRUE.)
      SELECT CASE (data_type)
      CASE (dbcsr_type_real_4)
         CALL prepare_buffers_s(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_s(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      CASE (dbcsr_type_real_8)
         CALL prepare_buffers_d(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_d(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      CASE (dbcsr_type_complex_4)
         CALL prepare_buffers_c(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_c(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      CASE (dbcsr_type_complex_8)
         CALL prepare_buffers_z(sm%negate_real, sm%negate_imaginary, &
                                iter, row, col, blk, blk_p, bp, &
                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                row_img, col_img, nrow_images, ncol_images, &
                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                target_imgdist, prow, pcol, rowi, coli, &
                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                dbcsr_get_data_p_z(sm%data_area), &
                                send_data_area, scale_neg_one, scale_value)
      END SELECT
      CALL dbcsr_iterator_stop(iter)

      ! Deallocate thread private data
      !
      DEALLOCATE (myt_send_count)
      DEALLOCATE (myt_total_send_count)
      DEALLOCATE (myt_smp, myt_sdp)

      CALL timestop(handle2)
!$OMP END PARALLEL

      ! Exchange the data and metadata structures. In the interesting cases (square grids, row col distribution same),
      ! there are only very few processors that need to exchange data.
      ! The hybrid_alltoall deals with this by doing point to point communication
      CALL timeset(routineN//"_data", handle2)
      CALL hybrid_alltoall_any(send_data_area, total_send_count(2, :), sd_disp(:) - 1, &
                               recv_data_area, total_recv_count(2, :), rd_disp(:) - 1, &
                               mp_obj, &
                               most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE.)
      CALL hybrid_alltoall_i1( &
         send_meta(:), metalen*total_send_count(1, :), sm_disp(:) - 1, &
         recv_meta(:), metalen*total_recv_count(1, :), rm_disp(:) - 1, &
         most_ptp=.TRUE., remainder_ptp=.TRUE., no_hybrid=.FALSE., &
         mp_env=mp_obj)
      CALL timestop(handle2)

      ! Now create the work index and/or copy the relevant data from the
      ! receive buffer into the local indices.
      !
      DO src_p = 0, numproc - 1
         DO blk_l = 1, total_recv_count(1, src_p)
            stored_row = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1))
            stored_col = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 1)
            stored_blk_p = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 2)
            row_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 3)
            col_img = recv_meta(rm_disp(src_p) + metalen*(blk_l - 1) + 4)
            nze = row_blk_size(stored_row)*col_blk_size(stored_col)
            blk = blks(row_img, col_img)
            blks(row_img, col_img) = blks(row_img, col_img) + 1
            blk_ps(row_img, col_img) = blk_ps(row_img, col_img) + nze
            ums%mats(row_img, col_img)%wms(1)%row_i(blk) = stored_row
            ums%mats(row_img, col_img)%wms(1)%col_i(blk) = stored_col
            ums%mats(row_img, col_img)%wms(1)%blk_p(blk) = &
               SIGN(rd_disp(src_p) + ABS(stored_blk_p) - 1, stored_blk_p)
         END DO
      END DO

      ! Finalize the actual imaged matrices from the work matrices
      !
      DO row_img = 1, nrow_images
         DO col_img = 1, ncol_images
            ums%mats(row_img, col_img)%wms(1)%lastblk = blks(row_img, col_img) - 1
            ums%mats(row_img, col_img)%wms(1)%datasize = blk_ps(row_img, col_img) - 1
            !
            CALL dbcsr_data_set_size_referenced( &
               ums%mats(row_img, col_img)%wms(1)%data_area, &
               ums%mats(row_img, col_img)%wms(1)%datasize)
            IF (nrow_images .EQ. 1 .AND. ncol_images .EQ. 1 .OR. nocopy) THEN
               CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.FALSE.)
            ELSE
               CALL dbcsr_special_finalize(ums%mats(row_img, col_img), reshuffle=.TRUE.)
            END IF

            ! Save the home process and image row and column
            CALL image_calculator(target_imgdist, &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_prow), &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_rowi), &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_pcol), &
                                  ums%mats(row_img, col_img)%index(dbcsr_slot_home_coli), &
                                  vprow=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vprow), &
                                  vpcol=ums%mats(row_img, col_img)%index(dbcsr_slot_home_vpcol), &
                                  myrowi=row_img, mycoli=col_img, &
                                  shifting=predist_type)
         END DO
      END DO

      ! Deallocate shared temporary buffers
      !
      DEALLOCATE (send_count, recv_count)
      DEALLOCATE (total_send_count, total_recv_count)
      DEALLOCATE (sdp, smp, sd_disp, sm_disp)
      DEALLOCATE (rd_disp, rm_disp)
      DEALLOCATE (all_total_send_offset)
      DEALLOCATE (blk_ps, blks)
      DEALLOCATE (recv_meta, send_meta)

      CALL dbcsr_data_release(send_data_area)
      CALL dbcsr_data_release(received_data_area)
      CALL dbcsr_data_release(recv_data_area)

      CALL timestop(handle)
   END SUBROUTINE make_images

   SUBROUTINE dbcsr_make_images_dense(images, new_rdist, &
                                      row_map, col_map, join_cols, join_rows, new_template)
      !! Makes dense matrices for the image matrices.
      !! @note Used for making matrices dense/undense

      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: images
         !! current (undense) matrix images, output is the dense matrix images
      TYPE(dbcsr_imagedistribution_obj), INTENT(INOUT)   :: new_rdist
         !! the new image distribution for dense matrices
      TYPE(array_i1d_obj), INTENT(IN)                    :: row_map, col_map
         !! mapping of current (undense) rows to dense rows
         !! mapping of current (undense) columns to dense columns
      LOGICAL, INTENT(IN)                                :: join_cols, join_rows
         !! make columns dense, default is yes
         !! make rows dense, default is yes
      TYPE(dbcsr_type), INTENT(IN)                       :: new_template
         !! template dense matrix for creating image matrices

      CHARACTER(len=*), PARAMETER :: routineN = 'dbcsr_make_images_dense'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, mat_col, mat_row, mat_vpcol, &
                                                            mat_vprow
      INTEGER, DIMENSION(:), CONTIGUOUS, POINTER         :: und_col_blk_offsets, und_row_blk_offsets
      INTEGER, DIMENSION(dbcsr_meta_size)                :: old_meta
      REAL(kind=dp)                                      :: cs
      TYPE(array_i1d_obj)                                :: dense_local_vcols, dense_local_vrows, &
                                                            und_local_vcols, und_local_vrows
      TYPE(dbcsr_imagedistribution_obj)                  :: old_rdist
      TYPE(dbcsr_type)                                   :: tmp_mat

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      old_rdist = images%image_dist
      !
      DO mat_row = 1, images%image_dist%i%row_decimation
         DO mat_col = 1, images%image_dist%i%col_decimation
            IF (dbg) THEN
               cs = dbcsr_checksum(images%mats(mat_row, mat_col))
               WRITE (*, *) routineN//" cs pre", cs
            END IF
            mat_vprow = images%mats(mat_row, mat_col)%index(dbcsr_slot_home_vprow)
            mat_vpcol = images%mats(mat_row, mat_col)%index(dbcsr_slot_home_vpcol)
            und_row_blk_offsets => array_data(images%mats(mat_row, mat_col)%row_blk_offset)
            und_col_blk_offsets => array_data(images%mats(mat_row, mat_col)%col_blk_offset)
            CALL dbcsr_get_local_vrows(old_rdist, und_local_vrows, mat_vprow)
            CALL dbcsr_get_local_vcols(old_rdist, und_local_vcols, mat_vpcol)

            CALL dbcsr_get_local_vrows(new_rdist, dense_local_vrows, mat_vprow)
            CALL dbcsr_get_local_vcols(new_rdist, dense_local_vcols, mat_vpcol)
            ! The old matrix has to be remembered so it is copied to
            ! tmp_mat.
            old_meta(:) = images%mats(mat_row, mat_col)%index(1:dbcsr_meta_size)
            tmp_mat = dbcsr_type()
            tmp_mat = images%mats(mat_row, mat_col)
            images%mats(mat_row, mat_col) = dbcsr_type()
            CALL dbcsr_create(images%mats(mat_row, mat_col), template=new_template)
            images%mats(mat_row, mat_col)%index(dbcsr_slot_home_prow &
                                                :dbcsr_slot_home_vpcol) = &
               old_meta(dbcsr_slot_home_prow:dbcsr_slot_home_vpcol)
            CALL dbcsr_make_dense_low(tmp_mat, images%mats(mat_row, mat_col), &
                                      array_data(und_local_vrows), array_data(und_local_vcols), &
                                      und_row_blk_offsets, und_col_blk_offsets, &
                                      array_data(dense_local_vrows), &
                                      array_data(dense_local_vcols), &
                                      array_data(new_template%row_blk_offset), &
                                      array_data(new_template%col_blk_offset), &
                                      array_data(row_map), array_data(col_map), join_rows, join_cols)
            !
            CALL dbcsr_index_prune_deleted(images%mats(mat_row, mat_col))
            !
            CALL dbcsr_release(tmp_mat)
            IF (dbg) THEN
               cs = dbcsr_checksum(images%mats(mat_row, mat_col))
               WRITE (*, *) routineN//" cs pst", cs
            END IF
         END DO
      END DO
      CALL dbcsr_image_dist_release(images%image_dist)
      images%image_dist = new_rdist
      CALL dbcsr_image_dist_hold(images%image_dist)
      CALL timestop(handle)
   END SUBROUTINE dbcsr_make_images_dense

   SUBROUTINE multiply_cannon(left_set, right_set, product_matrix, &
                              retain_sparsity, &
                              filter_eps, flop, keep_product_data)
      !! Multiplies two DBCSR matrices

      TYPE(dbcsr_2d_array_type), POINTER                 :: left_set, right_set
         !! set of imaged left matrices
         !! set of imaged right matrices
      TYPE(dbcsr_type), INTENT(INOUT)                    :: product_matrix
         !! DBCSR product matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: retain_sparsity
         !! retain the sparsity of the existing product matrix; default is no
      REAL(kind=real_8), INTENT(in), OPTIONAL            :: filter_eps
      INTEGER(KIND=int_8), INTENT(OUT)                   :: flop
         !! effective flop
      LOGICAL, INTENT(IN)                                :: keep_product_data

      CHARACTER(len=*), PARAMETER :: routineN = 'multiply_cannon'
      INTEGER, PARAMETER                                 :: idata = 1, ileft = 0, imeta = 2, &
                                                            iright = 2

      INTEGER :: data_type, data_type_byte, handle, handle1, handle2, handle3, i, ithread, &
                 left_col_image, left_col_mult, left_col_nimages, left_dst_icol, left_dst_irow, &
                 left_dst_p, left_dst_pcol, left_dst_prow, left_dst_vcol, left_dst_vrow, left_max_nblks, &
                 left_max_nze, left_myfirstvcol, left_myfirstvrow, left_mypcol, left_myprow, left_npcols, &
                 left_nprows, left_recv_icol, left_recv_irow, left_recv_p, left_recv_pcol, left_recv_prow, &
                 left_recv_vcol, left_recv_vrow, left_row_image, left_row_mult, left_row_nimages, &
                 left_send_icol, left_send_irow, left_send_p, left_send_pcol, left_send_prow
      INTEGER :: left_send_vcol, left_send_vrow, left_src_icol, left_src_irow, left_src_p, &
                 left_src_pcol, left_src_prow, left_src_vcol, left_src_vrow, metronome, min_nimages, &
                 mynode, nblkrows_used, nsteps_k, nthreads, numnodes, nvirt_k, &
                 output_unit, right_col_image, right_col_mult, right_col_nimages, right_dst_icol, &
                 right_dst_irow, right_dst_p, right_dst_pcol, right_dst_prow, right_dst_vcol, &
                 right_dst_vrow, right_max_nblks, right_max_nze, right_myfirstvcol, right_myfirstvrow, &
                 right_mypcol, right_myprow, right_npcols, right_nprows, right_recv_icol, right_recv_irow
      INTEGER :: right_recv_p, right_recv_pcol, right_recv_prow, right_recv_vcol, right_recv_vrow, &
                 right_row_image, right_row_mult, right_row_nimages, right_send_icol, right_send_irow, &
                 right_send_p, right_send_pcol, right_send_prow, right_send_vcol, right_send_vrow, &
                 right_src_icol, right_src_irow, right_src_p, right_src_pcol, right_src_prow, &
                 right_src_vcol, right_src_vrow, row, size_guess, size_guess_init, stat, threads_finished, &
                 threads_finished_read, v_ki, v_ki_left, v_ki_right
      INTEGER(KIND=int_8)                                :: flop_single, flop_total, mem
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: row_counts, total_row_counts
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: left_sizes, my_sizes, right_sizes
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: all_sizes
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_blk_sizes2enum, enum2col_blk_sizes, &
                                                    enum2row_blk_sizes, &
                                                    left_index_rp, left_index_sp, m_sizes, n_sizes, &
                                                    right_index_rp, right_index_sp, &
                                                    row_blk_sizes2enum
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: k_sizes
      INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS      :: left_pgrid, product_pgrid, right_pgrid
      INTEGER, SAVE                                      :: mult_id = 0
      LOGICAL                                            :: keep_sparsity, list_indexing, &
                                                            otf_filtering

      REAL(kind=sp), ALLOCATABLE, DIMENSION(:) :: left_norms, right_norms, &
                                                  row_max_epss
      REAL(kind=sp)                            :: filter_eps_sp
      TYPE(dbcsr_2d_array_type), POINTER :: left_buffer_2, left_buffer_calc, &
                                            left_buffer_comm, right_buffer_2, right_buffer_calc, right_buffer_comm
      TYPE(dbcsr_data_obj)                     :: left_data_rp, left_data_sp, &
                                                  right_data_rp, right_data_sp
      TYPE(dbcsr_data_obj), POINTER            :: trs_stackbuf_calc, &
                                                  trs_stackbuf_comm
      TYPE(dbcsr_data_obj), TARGET             :: trs_stackbuf_1, trs_stackbuf_2
      TYPE(dbcsr_mm_multrec_type_p), DIMENSION(:), ALLOCATABLE :: multrec
      TYPE(dbcsr_mp_obj)                       :: left_mp_obj, product_mp_obj, &
                                                  right_mp_obj
      TYPE(mp_comm_type)                       :: grp, mp_group
      TYPE(mp_request_type), DIMENSION(:), ALLOCATABLE :: left_data_rr, left_data_sr, left_index_rr, &
                                                         left_index_sr, right_data_rr, right_data_sr, right_index_rr, right_index_sr

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      NULLIFY (trs_stackbuf_calc, trs_stackbuf_comm)
      NULLIFY (row_blk_sizes2enum, enum2row_blk_sizes)
      NULLIFY (col_blk_sizes2enum, enum2col_blk_sizes)
      NULLIFY (k_sizes)
      !
      ALLOCATE (left_buffer_2, right_buffer_2)
      mult_id = mult_id + 1

      IF (PRESENT(retain_sparsity)) THEN
         keep_sparsity = retain_sparsity
      ELSE
         keep_sparsity = .FALSE.
      END IF
      otf_filtering = PRESENT(filter_eps)

!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, nthreads, product_matrix)
!$OMP MASTER
      nthreads = 1
!$    nthreads = OMP_GET_NUM_THREADS()
      IF (.NOT. ASSOCIATED(product_matrix%wms)) &
         DBCSR_ABORT("Work matrices do not exist")
      IF (SIZE(product_matrix%wms) .NE. nthreads) &
         DBCSR_ABORT("Work matrices not correctly sized.")
      ALLOCATE (multrec(0:nthreads - 1))
!$OMP END MASTER
!$OMP END PARALLEL

      output_unit = default_output_unit
      flop_total = 0
      ! Set up variables
      data_type = dbcsr_get_data_type(product_matrix)
      data_type_byte = dbcsr_datatype_sizeof(data_type)
      left_row_nimages = left_set%image_dist%i%row_decimation
      left_row_mult = left_set%image_dist%i%row_multiplicity
      left_col_nimages = left_set%image_dist%i%col_decimation
      left_col_mult = left_set%image_dist%i%col_multiplicity
      right_row_nimages = right_set%image_dist%i%row_decimation
      right_row_mult = right_set%image_dist%i%row_multiplicity
      right_col_nimages = right_set%image_dist%i%col_decimation
      right_col_mult = right_set%image_dist%i%col_multiplicity
      left_mp_obj = dbcsr_distribution_mp(left_set%image_dist%i%main)
      right_mp_obj = dbcsr_distribution_mp(right_set%image_dist%i%main)
      product_mp_obj = dbcsr_distribution_mp(product_matrix%dist)
      numnodes = dbcsr_mp_numnodes(product_mp_obj)
      mynode = dbcsr_mp_mynode(product_mp_obj)
      left_nprows = dbcsr_mp_nprows(left_mp_obj)
      left_npcols = dbcsr_mp_npcols(left_mp_obj)
      left_myprow = dbcsr_mp_myprow(left_mp_obj)
      left_mypcol = dbcsr_mp_mypcol(left_mp_obj)
      left_myfirstvrow = dbcsr_mp_myprow(left_mp_obj)*left_row_nimages
      left_myfirstvcol = dbcsr_mp_mypcol(left_mp_obj)*left_col_nimages
      right_nprows = dbcsr_mp_nprows(right_mp_obj)
      right_npcols = dbcsr_mp_npcols(right_mp_obj)
      right_myprow = dbcsr_mp_myprow(right_mp_obj)
      right_mypcol = dbcsr_mp_mypcol(right_mp_obj)
      right_myfirstvrow = dbcsr_mp_myprow(right_mp_obj)*right_row_nimages
      right_myfirstvcol = dbcsr_mp_mypcol(right_mp_obj)*right_col_nimages
      mp_group = dbcsr_mp_group(product_mp_obj)
      left_pgrid => dbcsr_mp_pgrid(left_mp_obj)
      right_pgrid => dbcsr_mp_pgrid(right_mp_obj)
      product_pgrid => dbcsr_mp_pgrid(product_mp_obj)
      CALL dbcsr_mp_grid_setup(product_mp_obj)
      CALL dbcsr_mp_grid_setup(left_mp_obj)
      CALL dbcsr_mp_grid_setup(right_mp_obj)
      !
      ! Dummy checks
      ! left/right matching
      IF (left_col_nimages .NE. right_row_mult) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_mult .NE. right_row_nimages) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_nimages*left_npcols .NE. right_row_nimages*right_nprows) &
         DBCSR_ABORT("Left/Right total mismatch")
      ! product/left matching
      IF (left_row_mult*dbcsr_mp_nprows(product_mp_obj) .NE. left_row_nimages*left_nprows) &
         DBCSR_ABORT("Product/Left total mismatch")
      ! product/left matching
      IF (right_col_mult*dbcsr_mp_npcols(product_mp_obj) .NE. right_col_nimages*right_npcols) &
         DBCSR_ABORT("Product/Right total mismatch")
      ! Limitations
      IF (left_row_nimages .NE. 1) &
         DBCSR_ABORT("Product/Left matrix process grid mismatch")
      IF (left_row_mult .NE. 1) &
         DBCSR_ABORT("Product/Left matrix process grid mismatch")
      IF (right_col_nimages .NE. 1) &
         DBCSR_ABORT("Product/Right matrix process grid mismatch")
      IF (right_col_mult .NE. 1) &
         DBCSR_ABORT("Product/Right matrix process grid mismatch")

      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, left_row_nimages*left_col_nimages)
      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, right_row_nimages*right_col_nimages)
      !
      ! Exchange size data
      ALLOCATE (my_sizes(4, MAX(left_row_nimages, right_row_nimages), &
                         MAX(left_col_nimages, right_col_nimages)))
      my_sizes(:, :, :) = 0
      DO left_row_image = 1, left_row_nimages
         DO left_col_image = 1, left_col_nimages
            my_sizes(idata + ileft, left_row_image, left_col_image) &
               = dbcsr_data_get_size_referenced( &
                 left_set%mats(left_row_image, left_col_image)%data_area)
            my_sizes(imeta + ileft, left_row_image, left_col_image) = &
               left_set%mats(left_row_image, left_col_image)%index &
               (dbcsr_slot_size)
         END DO
      END DO

      DO right_row_image = 1, right_row_nimages
         DO right_col_image = 1, right_col_nimages
            my_sizes(idata + iright, right_row_image, right_col_image) &
               = dbcsr_data_get_size_referenced( &
                 right_set%mats(right_row_image, right_col_image)%data_area)
            my_sizes(imeta + iright, right_row_image, right_col_image) = &
               right_set%mats(right_row_image, right_col_image)%index &
               (dbcsr_slot_size)
         END DO
      END DO

      ALLOCATE (all_sizes(4, LBOUND(my_sizes, 2):UBOUND(my_sizes, 2), &
                          LBOUND(my_sizes, 3):UBOUND(my_sizes, 3), 0:numnodes - 1))
      CALL mp_allgather(my_sizes, all_sizes, mp_group)
      !
      ! Count the maximum possible multiplies per row for on-the-fly
      ! filtering.
      per_row_eps: IF (.NOT. otf_filtering) THEN
         ! These arrays must be valid when passed to called subroutines.
         ALLOCATE (left_norms(0), right_norms(0), row_max_epss(0), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory")
      ELSE
         IF (careful_mod) THEN
            IF (left_set%mats(1, 1)%bcsc) &
               DBCSR_ABORT("Can not do on-the-fly filtering with CSC-indexed matrices.")
         END IF
         IF (dbcsr_has_local_row_index(left_set%mats(1, 1))) THEN
            nblkrows_used = dbcsr_nblkrows_local(left_set%mats(1, 1))
         ELSE
            nblkrows_used = dbcsr_nblkrows_total(left_set%mats(1, 1))
         END IF
         ALLOCATE (row_max_epss(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left epsilons")
         ALLOCATE (row_counts(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left row counts")
         ! The summation could be done prow-locally but it would
         ! complicate the pre-row eps calculation.
         ALLOCATE (total_row_counts(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left row counts")
         ! Each prow member matrix (npcols * row_images) counts the
         ! blocks present in each of its rows.
         total_row_counts(:) = 0
         DO left_row_image = 1, left_row_nimages
            DO left_col_image = 1, left_col_nimages
               list_indexing = &
                  left_set%mats(left_row_image, left_col_image)%list_indexing
               IF (careful_mod) THEN
                  IF (list_indexing) THEN
                     IF ((left_set%mats(left_row_image, left_col_image)%nblks)*3 .NE. &
                         SIZE(left_set%mats(left_row_image, left_col_image)%coo_l)) &
                        DBCSR_ABORT("Row count mismatch")
                  ELSE
                     IF (nblkrows_used + 1 .NE. SIZE(left_set%mats(left_row_image, left_col_image)%row_p)) &
                        DBCSR_ABORT("Row count mismatch")
                  END IF
               END IF
               IF (list_indexing) THEN
                  CALL count_bins( &
                     left_set%mats(left_row_image, left_col_image)%nblks, &
                     left_set%mats(left_row_image, left_col_image)%coo_l(1::3), &
                     nblkrows_used, row_counts)
               ELSE
                  CALL dbcsr_count_row_index( &
                     left_set%mats(left_row_image, left_col_image)%row_p, &
                     row_counts, nblkrows_used)
               END IF
               total_row_counts(:) = total_row_counts(:) &
                                     + row_counts(:)
            END DO
         END DO
         ! The counted blocks are then summed up
         CALL mp_sum(total_row_counts, dbcsr_mp_my_row_group(product_mp_obj))
         ! and used to determine the maximum per-block epsilon.
         filter_eps_sp = REAL(filter_eps, KIND=KIND(row_max_epss))
!$OMP PARALLEL DO DEFAULT (NONE) &
!$OMP SHARED(nblkrows_used,row_max_epss,filter_eps_sp,&
!$OMP        total_row_counts)
         DO row = 1, nblkrows_used
            row_max_epss(row) &
               = (filter_eps_sp &
                  /REAL(MAX(1, total_row_counts(row)), KIND=KIND(row_max_epss)))**2
         END DO
!$OMP END PARALLEL DO
         !
         DEALLOCATE (row_counts)
         DEALLOCATE (total_row_counts)
      END IF per_row_eps
      !
      ! The main transfer loop goes through the virtual rows/columns.
      ! The number of steps may be smaller if the grid dimension is very
      ! non-optimal (both left column images and right row images are >
      ! 1).
      min_nimages = MIN(left_col_nimages, right_row_nimages)
      nvirt_k = left_npcols*left_col_nimages
      nsteps_k = nvirt_k/min_nimages
      !
      ! Translate the all_sizes to account for pre-distribution.  This
      ! is just done to simplify lookups.
      ALLOCATE (left_sizes(2, 0:left_nprows*left_row_nimages - 1, 0:nvirt_k - 1))
      left_sizes = -1
      DO left_src_vcol = 0, left_col_nimages*left_npcols - 1
         DO left_src_vrow = 0, left_row_nimages*left_nprows - 1
            ! Calculate what was shifted.  The left_src_v{row,col} are
            ! the "source" rows/columns; the left_dst are the shifted
            ! targets where the data was placed in make_images.
            CALL image_calculator(left_set%image_dist, &
                                  prow=left_dst_prow, pcol=left_dst_pcol, &
                                  rowi=left_dst_irow, coli=left_dst_icol, &
                                  myvprow=left_src_vrow, myvpcol=left_src_vcol, &
                                  shifting='l')
            left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol)
            left_sizes(idata, left_src_vrow, left_src_vcol) = &
               all_sizes( &
               idata + ileft, left_dst_irow, left_dst_icol, left_dst_p)
            left_sizes(imeta, left_src_vrow, left_src_vcol) = &
               all_sizes( &
               imeta + ileft, left_dst_irow, left_dst_icol, left_dst_p)
         END DO
      END DO
      !
      ALLOCATE (right_sizes(2, 0:nvirt_k - 1, 0:right_npcols*right_col_nimages - 1))
      right_sizes = -1
      DO right_src_vcol = 0, right_col_nimages*right_npcols - 1
         DO right_src_vrow = 0, right_row_nimages*right_nprows - 1
            ! Calculate what was shifted.  The right_src_v{row,col} are
            ! the "source" rows/columns; the right_dst are the shifted
            ! targets where the data was placed in make_images.
            CALL image_calculator(right_set%image_dist, &
                                  prow=right_dst_prow, pcol=right_dst_pcol, &
                                  rowi=right_dst_irow, coli=right_dst_icol, &
                                  myvprow=right_src_vrow, myvpcol=right_src_vcol, &
                                  shifting='r')
            right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol)
            right_sizes(idata, right_src_vrow, right_src_vcol) = &
               all_sizes( &
               idata + iright, right_dst_irow, right_dst_icol, right_dst_p)
            right_sizes(imeta, right_src_vrow, right_src_vcol) = &
               all_sizes( &
               imeta + iright, right_dst_irow, right_dst_icol, right_dst_p)
         END DO
      END DO
      !
      ! Setup product work areas
      left_max_nze = MAXVAL(all_sizes(idata + ileft, :, :, :))
      left_max_nblks = MAXVAL(all_sizes(imeta + ileft, :, :, :))
      right_max_nze = MAXVAL(all_sizes(idata + iright, :, :, :))
      right_max_nblks = MAXVAL(all_sizes(imeta + iright, :, :, :))
      !!
      ! Evaluate sizes for workspaces
      IF (.NOT. keep_sparsity) THEN
         IF (use_acc()) THEN
            size_guess_init = product_matrix_size_guess(left_set%mats(1, 1), right_set%mats(1, 1), product_matrix, &
                                                        left_max_nze, right_max_nze, &
                                                        left_col_nimages, right_row_nimages, &
                                                        nthreads)
         ELSE
            size_guess_init = 1
         END IF
      END IF
      ithread = 0
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP          PRIVATE (i, size_guess, ithread) &
!$OMP          SHARED (product_matrix, left_max_nze, right_max_nze) &
!$OMP          SHARED (left_set, right_set, &
!$OMP                 left_col_nimages, right_row_nimages) &
!$OMP          SHARED (nthreads, keep_sparsity, mynode, size_guess_init)
      !
!$    ithread = OMP_GET_THREAD_NUM()
      ! The work arrays have to be setup (actually, not quite sure).
      i = ithread + 1
      size_guess = product_matrix%wms(i)%datasize ! Should be minimal
      IF (.NOT. keep_sparsity) THEN
         size_guess = MAX(size_guess, size_guess_init)
      END IF
      CALL dbcsr_data_ensure_size(product_matrix%wms(i)%data_area, &
                                  size_guess)
      CALL dbcsr_data_set_size_referenced(product_matrix%wms(i)%data_area, &
                                          product_matrix%wms(i)%datasize)
      ! XXXXXXX a quick fix right now, allocation with size 1 might actually not be needed at all,
      !         but something expects this to be associated
      CALL ensure_array_size(product_matrix%wms(i)%row_i, ub=1)
      CALL ensure_array_size(product_matrix%wms(i)%col_i, ub=1)
      CALL ensure_array_size(product_matrix%wms(i)%blk_p, ub=1)
!$OMP END PARALLEL

      ! update capacity of memory-pools, +1 for the dense case
      IF (ASSOCIATED(memtype_abpanel_1%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_1%pool, &
                                           capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1)
      IF (ASSOCIATED(memtype_abpanel_2%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_2%pool, &
                                           capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1)
      IF (use_acc()) THEN
         ! enumerate the blocksizes to keep the following 2D-arrays small.
         CALL enumerate_blk_sizes(right_set%mats(1, 1)%row_blk_size%low%data, &
                                  dbcsr_max_row_size(right_set%mats(1, 1)), &
                                  row_blk_sizes2enum, enum2row_blk_sizes)
         CALL enumerate_blk_sizes(right_set%mats(1, 1)%col_blk_size%low%data, &
                                  dbcsr_max_col_size(right_set%mats(1, 1)), &
                                  col_blk_sizes2enum, enum2col_blk_sizes)
      END IF

      !
      ! Setup the left buffer matrices
      !
      CALL buffer_matrices_ensure_size(left_set, index_size=left_max_nblks, &
                                       data_size=left_max_nze)

      CALL setup_buffer_matrices(left_buffer_2, left_row_mult, left_col_nimages, &
                                 left_set%mats(1, 1), index_size=left_max_nblks, &
                                 data_size=left_max_nze)
      IF (otf_filtering) THEN
         ALLOCATE (left_norms(left_max_nblks), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left norms")
         IF (stat .NE. 0) otf_filtering = .FALSE.
      END IF
      left_buffer_calc => left_set
      left_buffer_comm => left_buffer_2
      ALLOCATE (left_data_sr(left_col_nimages))
      ALLOCATE (left_index_sr(left_col_nimages))
      ALLOCATE (left_data_rr(left_col_nimages))
      ALLOCATE (left_index_rr(left_col_nimages))
      left_data_sr = mp_request_null
      left_data_rr = mp_request_null
      left_index_sr = mp_request_null
      left_index_rr = mp_request_null

      ! Setup buffers for right matrix
      CALL buffer_matrices_ensure_size(right_set, index_size=right_max_nblks, &
                                       data_size=right_max_nze)

      CALL setup_buffer_matrices(right_buffer_2, right_row_nimages, right_col_mult, &
                                 right_set%mats(1, 1), index_size=right_max_nblks, data_size=right_max_nze)
      IF (otf_filtering) THEN
         ALLOCATE (right_norms(right_max_nblks), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_WARN("Could not allocate memory for right norms")
         IF (stat .NE. 0) otf_filtering = .FALSE.
      END IF
      right_buffer_calc => right_set
      right_buffer_comm => right_buffer_2
      ALLOCATE (right_data_sr(right_row_nimages))
      ALLOCATE (right_index_sr(right_row_nimages))
      ALLOCATE (right_data_rr(right_row_nimages))
      ALLOCATE (right_index_rr(right_row_nimages))
      right_data_sr = mp_request_null
      right_data_rr = mp_request_null
      right_index_sr = mp_request_null
      right_index_rr = mp_request_null
      !
      ALLOCATE (m_sizes(dbcsr_nblkrows_local(product_matrix)))
      CALL local_filter(array_data(product_matrix%row_blk_size), array_size(product_matrix%local_rows), &
                        array_data(product_matrix%local_rows), m_sizes)
      ALLOCATE (n_sizes(dbcsr_nblkcols_local(product_matrix)))
      CALL local_filter(array_data(product_matrix%col_blk_size), array_size(product_matrix%local_cols), &
                        array_data(product_matrix%local_cols), n_sizes)
      !
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP SHARED (left_buffer_comm, right_buffer_comm, product_matrix,&
!$OMP         keep_sparsity, filter_eps, row_max_epss, multrec, nthreads, &
!$OMP         right_data_sr, right_data_rr, left_data_sr, left_data_rr,&
!$OMP         right_index_sr, right_index_rr, left_index_sr, left_index_rr,&
!$OMP         m_sizes, n_sizes, keep_product_data), &
!$OMP PRIVATE(ithread)
      ithread = 0
!$    ithread = OMP_GET_THREAD_NUM()
      ALLOCATE (multrec(ithread)%p)
      CALL dbcsr_mm_multrec_init(multrec(ithread)%p, &
                                 product=product_matrix, &
                                 keep_sparsity=keep_sparsity, &
                                 eps=filter_eps, &
                                 row_max_epss=row_max_epss, &
                                 block_estimate=MAX(product_matrix%nblks, &
                                                    left_buffer_comm%mats(1, 1)%nblks, &
                                                    right_buffer_comm%mats(1, 1)%nblks)/nthreads, &
                                 right_row_blk_size=array_data(right_buffer_comm%mats(1, 1)%row_blk_size), &
                                 m_sizes=m_sizes, n_sizes=n_sizes, &
                                 keep_product_data=keep_product_data)
!$OMP END PARALLEL
      !
      ! Setup indexing
      CALL setup_rec_index_2d(left_set, left_row_nimages, left_col_nimages)
      CALL setup_rec_index_2d(right_set, right_row_nimages, right_col_nimages)
      !
      ! Setup the send/receive data pointers
      CALL dbcsr_data_init(left_data_sp)
      CALL dbcsr_data_init(left_data_rp)
      CALL dbcsr_data_init(right_data_sp)
      CALL dbcsr_data_init(right_data_rp)
      CALL dbcsr_data_new(left_data_sp, data_type)
      CALL dbcsr_data_new(left_data_rp, data_type)
      CALL dbcsr_data_new(right_data_sp, data_type)
      CALL dbcsr_data_new(right_data_rp, data_type)

      ! Setup transpose stackbuffers
      IF (use_acc()) THEN
         CALL dbcsr_data_init(trs_stackbuf_1)
         CALL dbcsr_data_init(trs_stackbuf_2)
         CALL dbcsr_data_new(trs_stackbuf_1, data_type=dbcsr_type_int_4, &
                             data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_1)
         CALL dbcsr_data_new(trs_stackbuf_2, data_type=dbcsr_type_int_4, &
                             data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_2)
         trs_stackbuf_calc => trs_stackbuf_1
         trs_stackbuf_comm => trs_stackbuf_2
      END IF
      !
      ! Reset indices for virtual images
      v_ki_right = 0
      v_ki_left = 0
      !
      ! Here is the main loop.
      !
      ! In the first loop iteration, the data is fetched from the
      ! sources. In the remaining iterations, the data are exchanged
      ! among neighbors.  In the last loop only calculations take place.
      !
      CALL timeset(routineN//"_loop", handle1)
      !
      grouped_k_index: DO metronome = 0, nvirt_k - 1
         ! Wait for right matrix transfer completion. Wait in all but
         ! the first loop iteration.
         CALL timeset(routineN//"_metrocomm1", handle2)
         wait_right: IF (v_ki_right .EQ. right_row_nimages) THEN
            ! Reset index
            v_ki_right = 0
            IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for right"
            !
            CALL mp_waitall(right_data_sr)
            CALL mp_waitall(right_data_rr)
            CALL mp_waitall(right_index_sr)
            CALL mp_waitall(right_index_rr)
            !
            ! Repoint indices of right matrices
            DO v_ki = 0, right_row_nimages - 1
               CALL dbcsr_repoint_index(right_buffer_calc%mats(v_ki + 1, 1))
               right_buffer_calc%mats(v_ki + 1, 1)%valid = .TRUE.
            END DO
         END IF wait_right
         CALL timestop(handle2)
         !
         ! Right matrix transfer. Transfer in all but the last loop
         ! iteration.
         xfer_right: IF (v_ki_right .EQ. 0 .AND. metronome + right_row_nimages .LT. nvirt_k) THEN
            DO v_ki = 0, right_row_nimages - 1
               ! Calculate the process to send to.  It's the virtual
               ! process row -min_nimages up (i.e., smaller row number)
               ! from me.
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_send_prow, rowi=right_send_irow, & ! output
                                     pcol=right_send_pcol, coli=right_send_icol, & ! output
                                     vprow=right_send_vrow, vpcol=right_send_vcol, & ! output
                                     ! myvprow goes through all of my (process row) images
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, & ! nothing happens in the columns
                                     vprow_shift=-right_row_nimages, &
                                     shifting='0')
               ! Calculate which data I send.
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_dst_prow, rowi=right_dst_irow, &
                                     pcol=right_dst_pcol, coli=right_dst_icol, &
                                     vprow=right_dst_vrow, vpcol=right_dst_vcol, &
                                     ! myvprows goes through all of my (process row) images
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, & ! nothing happens in the columns
                                     vprow_shift=metronome, &
                                     ! This is with relative shifting.
                                     shifting='R')
               right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=right_data_sp, &
                  rsize=right_sizes(idata, right_dst_vrow, right_dst_vcol), &
                  csize=1, &
                  pointee=right_buffer_calc%mats(v_ki + 1, 1)%data_area)
               right_index_sp => right_buffer_calc%mats( &
                                 v_ki + 1, 1 &
                                 )%index(1: &
                                         right_sizes(imeta, right_dst_vrow, right_dst_vcol))
               !
               ! Calculate the process to receive from
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_recv_prow, rowi=right_recv_irow, &
                                     pcol=right_recv_pcol, coli=right_recv_icol, &
                                     vprow=right_recv_vrow, vpcol=right_recv_vcol, &
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, &
                                     vprow_shift=+right_row_nimages, & ! just the opposite as "send to"
                                     shifting='0')
               ! Calculate which data I receive
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_src_prow, rowi=right_src_irow, &
                                     pcol=right_src_pcol, coli=right_src_icol, &
                                     vprow=right_src_vrow, vpcol=right_src_vcol, &
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, &
                                     ! receive window moves with the metronome
                                     vprow_shift=metronome + right_row_nimages, &
                                     shifting='R')
               !
               IF (use_acc()) THEN
                  CALL timeset(routineN//"_acc_sync_right", handle3)
                  CALL acc_event_synchronize(right_buffer_comm%mats(v_ki + 1, 1)%data_area%d%acc_ready)
                  CALL timestop(handle3)
               END IF

               right_src_p = right_pgrid(right_src_prow, right_src_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=right_data_rp, &
                  rsize=right_sizes(idata, right_src_vrow, right_src_vcol), &
                  csize=1, &
                  pointee=right_buffer_comm%mats(v_ki + 1, 1)%data_area)
               right_index_rp => right_buffer_comm%mats( &
                                 v_ki + 1, 1 &
                                 )%index(1: &
                                         right_sizes(imeta, right_src_vrow, right_src_vcol))
               !
               right_send_p = right_pgrid(right_send_prow, right_send_pcol)
               right_recv_p = right_pgrid(right_recv_prow, right_recv_pcol)
               ! These are column-communicator relative
               IF (dbcsr_mp_has_subgroups(right_mp_obj)) THEN
                  right_send_p = right_send_prow
                  right_recv_p = right_recv_prow
                  grp = dbcsr_mp_my_col_group(right_mp_obj)
               ELSE
                  grp = dbcsr_mp_group(right_mp_obj)
               END IF
               !
               CALL timeset(routineN//"_metrocomm2", handle2)
               CALL dbcsr_irecv_any(right_data_rp, right_recv_p, &
                                    grp, right_data_rr(v_ki + 1), tag=right_src_vrow)
               CALL mp_irecv(right_index_rp, right_recv_p, &
                             grp, right_index_rr(v_ki + 1), tag=right_src_vrow)
               CALL dbcsr_isend_any(right_data_sp, right_send_p, &
                                    grp, right_data_sr(v_ki + 1), tag=right_dst_vrow)
               CALL mp_isend(right_index_sp, right_send_p, &
                             grp, right_index_sr(v_ki + 1), tag=right_dst_vrow)
               dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
               CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), &
                                         dbcsr_data_get_size(right_data_rp), &
                                         data_type_byte, &
                                         dbcsr_mpi_statistics%data_size_breakdown(:, :, 1))
               CALL timestop(handle2)
            END DO
         END IF xfer_right
         !
         ! Wait for left matrix transfer completion. Wait in all but
         ! the first loop iteration.
         CALL timeset(routineN//"_metrocomm3", handle2)
         wait_left: IF (v_ki_left .EQ. left_col_nimages) THEN
            ! Reset index
            v_ki_left = 0
            IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for left"
            CALL mp_waitall(left_data_sr)
            CALL mp_waitall(left_data_rr)
            CALL mp_waitall(left_index_sr)
            CALL mp_waitall(left_index_rr)
            !
            ! Repoint indices of left matrices
            DO v_ki = 0, left_col_nimages - 1
               CALL dbcsr_repoint_index(left_buffer_calc%mats(1, v_ki + 1))
               left_buffer_calc%mats(1, v_ki + 1)%valid = .TRUE.
            END DO
         END IF wait_left
         CALL timestop(handle2)
         !
         ! Left matrix transfer. Transfer in all but the last processor images.
         xfer_left: IF (v_ki_left .EQ. 0 .AND. metronome + left_col_nimages .LT. nvirt_k) THEN
            DO v_ki = 0, left_col_nimages - 1
               ! Calculate the process to send to.
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_send_prow, rowi=left_send_irow, & ! output
                                     pcol=left_send_pcol, coli=left_send_icol, & ! output
                                     vprow=left_send_vrow, vpcol=left_send_vcol, & ! output
                                     myvprow=left_myfirstvrow, & ! nothing happens in the rows
                                     ! go through all my column images
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     ! send to process left_col_nimages left in the grid
                                     vpcol_shift=-left_col_nimages, &
                                     shifting='0')
               ! Calculate which data I send.
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_dst_prow, rowi=left_dst_irow, &
                                     pcol=left_dst_pcol, coli=left_dst_icol, &
                                     vprow=left_dst_vrow, vpcol=left_dst_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     ! go through all my column images
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     vpcol_shift=metronome, &
                                     ! This is with relative shifting.
                                     shifting='L')
               !
               left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=left_data_sp, &
                  rsize=left_sizes(idata, left_dst_vrow, left_dst_vcol), &
                  csize=1, &
                  pointee=left_buffer_calc%mats(1, v_ki + 1)%data_area)
               left_index_sp => left_buffer_calc%mats( &
                                1, v_ki + 1 &
                                )%index(1: &
                                        left_sizes(imeta, left_dst_vrow, left_dst_vcol))
               !
               ! Calculate the process to receive from
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_recv_prow, rowi=left_recv_irow, &
                                     pcol=left_recv_pcol, coli=left_recv_icol, &
                                     vprow=left_recv_vrow, vpcol=left_recv_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     vpcol_shift=+left_col_nimages, & ! just the opposite as "send to"
                                     shifting='0')
               ! Calculate which data I receive
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_src_prow, rowi=left_src_irow, &
                                     pcol=left_src_pcol, coli=left_src_icol, &
                                     vprow=left_src_vrow, vpcol=left_src_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     ! receive window moves with the metronome
                                     vpcol_shift=metronome + left_col_nimages, &
                                     shifting='L')
               !
               IF (use_acc()) THEN
                  CALL timeset(routineN//"_acc_sync_left", handle3)
                  CALL acc_event_synchronize(left_buffer_comm%mats(1, v_ki + 1)%data_area%d%acc_ready)
                  CALL timestop(handle3)
               END IF

               left_src_p = left_pgrid(left_src_prow, left_src_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=left_data_rp, &
                  rsize=left_sizes(idata, left_src_vrow, left_src_vcol), &
                  csize=1, &
                  pointee=left_buffer_comm%mats(1, v_ki + 1)%data_area)
               left_index_rp => left_buffer_comm%mats( &
                                1, v_ki + 1 &
                                )%index(1: &
                                        left_sizes(imeta, left_src_vrow, left_src_vcol))
               !
               left_send_p = left_pgrid(left_send_prow, left_send_pcol)
               left_recv_p = left_pgrid(left_recv_prow, left_recv_pcol)
               ! These are column-communicator relative
               IF (dbcsr_mp_has_subgroups(left_mp_obj)) THEN
                  left_send_p = left_send_pcol
                  left_recv_p = left_recv_pcol
                  grp = dbcsr_mp_my_row_group(left_mp_obj)
               ELSE
                  grp = dbcsr_mp_group(left_mp_obj)
               END IF
               !
               CALL timeset(routineN//"_metrocomm4", handle2)
               CALL dbcsr_irecv_any(left_data_rp, left_recv_p, &
                                    grp, left_data_rr(v_ki + 1), tag=left_src_vcol)
               CALL mp_irecv(left_index_rp, left_recv_p, &
                             grp, left_index_rr(v_ki + 1), tag=left_src_vcol)
               CALL dbcsr_isend_any(left_data_sp, left_send_p, &
                                    grp, left_data_sr(v_ki + 1), tag=left_dst_vcol)
               CALL mp_isend(left_index_sp, left_send_p, &
                             grp, left_index_sr(v_ki + 1), tag=left_dst_vcol)
               dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
               CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(2, :), &
                                         dbcsr_data_get_size(left_data_rp), &
                                         data_type_byte, &
                                         dbcsr_mpi_statistics%data_size_breakdown(:, :, 2))
               CALL timestop(handle2)
            END DO
         END IF xfer_left
         !
         ! Do multiplication
         v_ki_left = v_ki_left + 1
         v_ki_right = v_ki_right + 1
         IF (debug_mod) THEN
            CALL dbcsr_print(left_buffer_calc%mats(1, v_ki_left), nodata=.TRUE.)
            CALL dbcsr_print(right_buffer_calc%mats(v_ki_right, 1), nodata=.TRUE.)
         END IF
         !
         ! from here the code for dbcsr_mm_driver_inner_init was taken
         !
         IF (.FALSE.) WRITE (*, *) routineN//" TICK", metronome
         ! Since the right matrix is shifted vertically, the
         ! received data always has different notions of "local
         ! rows".  Thus the local_rows and global_rows must be
         ! recalculated.
         CALL dbcsr_reset_vlocals(right_buffer_calc%mats(v_ki_right, 1), &
                                  right_set%image_dist)
         CALL dbcsr_reset_vlocals(left_buffer_calc%mats(1, v_ki_left), &
                                  left_set%image_dist)
         !
         CALL ensure_array_size(k_sizes, ub=array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows))
         CALL local_filter(array_data(right_buffer_calc%mats(v_ki_right, 1)%row_blk_size), &
                           array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           array_data(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           k_sizes)
         !
         IF (use_acc()) THEN
            CALL dbcsr_data_host2dev(left_buffer_calc%mats(1, v_ki_left)%data_area)
            CALL dbcsr_data_host2dev(right_buffer_calc%mats(v_ki_right, 1)%data_area)
            CALL acc_transpose_blocks(right_buffer_calc%mats(v_ki_right, 1), trs_stackbuf_calc, &
                                      k_sizes, n_sizes, &
                                      row_blk_sizes2enum, enum2row_blk_sizes, &
                                      col_blk_sizes2enum, enum2col_blk_sizes)
         END IF

         ! Sets the local right-matrix columns
         IF (otf_filtering) THEN
            left_norms(:) = huge_norm
            right_norms(:) = huge_norm
            CALL calculate_norms(right_buffer_calc%mats(v_ki_right, 1), &
                                 right_norms, k_sizes, n_sizes)
            CALL calculate_norms(left_buffer_calc%mats(1, v_ki_left), &
                                 left_norms, m_sizes, k_sizes)
         END IF

         ! Wait for left and right buffers transfer to device before proceeding
         IF (use_acc()) THEN
            CALL timeset(routineN//"_sync_h2d", handle2)
            CALL acc_device_synchronize()
            CALL timestop(handle2)
         END IF
         !
         flop_single = 0
         threads_finished = 0

!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (left_buffer_calc, right_buffer_calc, &
!$OMP         v_ki_left, v_ki_right, handle2, handle3, &
!$OMP         product_matrix, multrec,&
!$OMP         filter_eps, right_norms, left_norms, row_max_epss, &
!$OMP         keep_sparsity,threads_finished, &
!$OMP         right_data_sr, right_data_rr, right_index_sr, right_index_rr, &
!$OMP         left_data_sr, left_data_rr, left_index_sr, left_index_rr, &
!$OMP         dbcsr_cfg, k_sizes, nvirt_k, metronome) &
!$OMP PRIVATE (ithread,nthreads,threads_finished_read) &
!$OMP REDUCTION (+: flop_single)
         ithread = 0; nthreads = 1
!$       ithread = omp_get_thread_num(); nthreads = omp_get_num_threads()

         CALL timeset(routineN//"_multrec", handle2)

         CALL dbcsr_mm_multrec_multiply(multrec(ithread)%p, &
                                        left=left_buffer_calc%mats(1, v_ki_left), &
                                        right=right_buffer_calc%mats(v_ki_right, 1), &
                                        flop=flop_single, &
                                        a_norms=left_norms, b_norms=right_norms, &
                                        k_sizes=k_sizes)

         IF (metronome == nvirt_k - 1) THEN
            CALL timeset(routineN//"_multrec_finalize", handle3)
            CALL dbcsr_mm_multrec_finalize(multrec(ithread)%p)
            DEALLOCATE (multrec(ithread)%p)
            CALL timestop(handle3)
         END IF

!$OMP ATOMIC
         threads_finished = threads_finished + 1
         IF (dbcsr_cfg%use_comm_thread%val .AND. (ithread .EQ. 0)) THEN
            DO
! requires OMP 3.1 (e.g. gcc >=4.7), for correctness, otherwise we keep fingers crossed
#if defined _OPENMP && _OPENMP >= 200711
!$OMP                 ATOMIC READ
#endif
               threads_finished_read = threads_finished
               IF (threads_finished_read .EQ. nthreads) EXIT
               CALL mp_testany(right_data_sr)
               CALL mp_testany(right_data_rr)
               CALL mp_testany(left_data_sr)
               CALL mp_testany(left_data_rr)
               CALL mp_testany(right_index_sr)
               CALL mp_testany(right_index_rr)
               CALL mp_testany(left_index_sr)
               CALL mp_testany(left_index_rr)
            END DO
         END IF
!$OMP BARRIER
         CALL timestop(handle2)

!$OMP END PARALLEL
         flop_total = flop_total + flop_single
         !
         ! Move to the next images
         IF (v_ki_left .EQ. left_col_nimages) THEN
            CALL dbcsr_switch(left_buffer_calc, left_buffer_comm)
         END IF
         IF (v_ki_right .EQ. right_row_nimages) THEN
            CALL dbcsr_switch(right_buffer_calc, right_buffer_comm)
            CALL dbcsr_switch(trs_stackbuf_calc, trs_stackbuf_comm)
         END IF

      END DO grouped_k_index
      CALL timestop(handle1)
      CALL m_memory(mem)
      max_memory = MAX(max_memory, REAL(mem))

      IF (use_acc()) THEN
         CALL dbcsr_data_release(trs_stackbuf_1)
         CALL dbcsr_data_release(trs_stackbuf_2)
         DEALLOCATE (row_blk_sizes2enum, enum2row_blk_sizes)
         DEALLOCATE (col_blk_sizes2enum, enum2col_blk_sizes)
      END IF

      IF (ALLOCATED(right_norms)) THEN
         DEALLOCATE (right_norms)
      END IF
      IF (ALLOCATED(left_norms)) THEN
         DEALLOCATE (left_norms)
      END IF
      IF (ALLOCATED(row_max_epss)) THEN
         DEALLOCATE (row_max_epss)
      END IF
      !
      CALL dbcsr_destroy_array(right_buffer_2)
      CALL dbcsr_destroy_array(left_buffer_2)
      DEALLOCATE (my_sizes)
      !
      CALL dbcsr_data_clear_pointer(left_data_sp)
      CALL dbcsr_data_clear_pointer(left_data_rp)
      CALL dbcsr_data_clear_pointer(right_data_sp)
      CALL dbcsr_data_clear_pointer(right_data_rp)
      CALL dbcsr_data_release(left_data_sp)
      CALL dbcsr_data_release(left_data_rp)
      CALL dbcsr_data_release(right_data_sp)
      CALL dbcsr_data_release(right_data_rp)
      !
      DEALLOCATE (left_data_rr, left_data_sr, left_index_rr, left_index_sr, &
                  right_data_rr, right_data_sr, right_index_rr, right_index_sr)
      !
      !
      IF (debug_mod) THEN
         v_ki = 0
         DO i = 1, SIZE(product_matrix%blk_p)
            v_ki = MAX(v_ki, ABS(product_matrix%blk_p(i)))
         END DO
         WRITE (*, *) routineN//" Actual final size", &
            LOG(REAL(dbcsr_data_get_size(product_matrix%data_area)))/LOG(10.0), &
            LOG(REAL(v_ki))/LOG(10.0)
      END IF
      !
      flop = flop_total
      DEALLOCATE (left_buffer_2, right_buffer_2)
      DEALLOCATE (m_sizes, n_sizes)
      IF (ASSOCIATED(k_sizes)) DEALLOCATE (k_sizes)
      !
      CALL timestop(handle)
   END SUBROUTINE multiply_cannon

   SUBROUTINE multiply_cannon_g2g(left_set, right_set, product_matrix, &
                                  retain_sparsity, &
                                  filter_eps, flop, keep_product_data)
      !! Multiplies two DBCSR matrices
      !!
      !! This function is expected to be called only if __DBCSR_ACC_G2G
      !! is enabled and the data type is FP64.
      !!
      !! If __DBCSR_ACC is enabled, norms are calculated on the GPU and
      !! MPI calls reference buffers on the GPU device. Input matrices
      !! are copied from host to device only once. For the right matrix,
      !! transpose kernel is also called only once and the transposed
      !! matrix is transferred over MPI to neighbors.
      !!
      !! If __DBCSR_ACC is not enabled, all calculations are performed on
      !! the CPU and MPI calls reference host buffers.

      TYPE(dbcsr_2d_array_type), POINTER                 :: left_set, right_set
         !! set of imaged left matrices
         !! set of imaged right matrices
      TYPE(dbcsr_type), INTENT(INOUT)                    :: product_matrix
         !! DBCSR product matrix
      LOGICAL, INTENT(IN), OPTIONAL                      :: retain_sparsity
         !! retain the sparsity of the existing product matrix; default is no
      REAL(kind=real_8), INTENT(in), OPTIONAL            :: filter_eps
      INTEGER(KIND=int_8), INTENT(OUT)                   :: flop
         !! effective flop
      LOGICAL, INTENT(IN)                                :: keep_product_data

      CHARACTER(len=*), PARAMETER :: routineN = 'multiply_cannon'
      INTEGER, PARAMETER                                 :: idata = 1, ileft = 0, imeta = 2, &
                                                            iright = 2

      INTEGER :: data_type, data_type_byte, handle, handle1, handle2, handle3, i, ithread, &
                 left_col_image, left_col_mult, left_col_nimages, left_dst_icol, left_dst_irow, &
                 left_dst_p, left_dst_pcol, left_dst_prow, left_dst_vcol, left_dst_vrow, left_max_nblks, &
                 left_max_nze, left_myfirstvcol, left_myfirstvrow, left_mypcol, left_myprow, left_npcols, &
                 left_nprows, left_recv_icol, left_recv_irow, left_recv_p, left_recv_pcol, left_recv_prow, &
                 left_recv_vcol, left_recv_vrow, left_row_image, left_row_mult, left_row_nimages, &
                 left_send_icol, left_send_irow, left_send_p, left_send_pcol, left_send_prow
      INTEGER :: left_send_vcol, left_send_vrow, left_src_icol, left_src_irow, left_src_p, &
                 left_src_pcol, left_src_prow, left_src_vcol, left_src_vrow, metronome, min_nimages, &
                 mynode, nblkrows_used, nsteps_k, nthreads, numnodes, nvirt_k, &
                 output_unit, right_col_image, right_col_mult, right_col_nimages, right_dst_icol, &
                 right_dst_irow, right_dst_p, right_dst_pcol, right_dst_prow, right_dst_vcol, &
                 right_dst_vrow, right_max_nblks, right_max_nze, right_myfirstvcol, right_myfirstvrow, &
                 right_mypcol, right_myprow, right_npcols, right_nprows, right_recv_icol, right_recv_irow
      INTEGER :: right_recv_p, right_recv_pcol, right_recv_prow, right_recv_vcol, right_recv_vrow, &
                 right_row_image, right_row_mult, right_row_nimages, right_send_icol, right_send_irow, &
                 right_send_p, right_send_pcol, right_send_prow, right_send_vcol, right_send_vrow, &
                 right_src_icol, right_src_irow, right_src_p, right_src_pcol, right_src_prow, &
                 right_src_vcol, right_src_vrow, row, size_guess, size_guess_init, stat, threads_finished, &
                 threads_finished_read, v_ki, v_ki_left, v_ki_right, max_nblks
      INTEGER :: left_numnodes, right_numnodes, left_mynode, right_mynode
      INTEGER :: msglen
      INTEGER(KIND=int_8)                                :: flop_single, flop_total, mem
      INTEGER, ALLOCATABLE, DIMENSION(:)                 :: row_counts, total_row_counts
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :)           :: left_sizes, my_sizes, right_sizes
      INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :)        :: all_sizes
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_blk_sizes2enum, enum2col_blk_sizes, &
                                                    enum2row_blk_sizes, m_sizes, n_sizes, &
                                                    row_blk_sizes2enum, left_index_rp, left_index_sp, &
                                                    right_index_rp, right_index_sp
      INTEGER, DIMENSION(:), POINTER, CONTIGUOUS         :: k_sizes
      INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS      :: left_pgrid, product_pgrid, right_pgrid
      INTEGER, SAVE                                      :: mult_id = 0
      LOGICAL                                            :: keep_sparsity, list_indexing, &
                                                            otf_filtering
      LOGICAL                                            :: copy_left, copy_right

      REAL(kind=sp), ALLOCATABLE, DIMENSION(:) :: left_norms, right_norms, &
                                                  row_max_epss
      REAL(kind=sp)                            :: filter_eps_sp
      TYPE(dbcsr_2d_array_type), POINTER :: left_buffer_2, left_buffer_calc, &
                                            left_buffer_comm, right_buffer_2, right_buffer_calc, right_buffer_comm
      TYPE(dbcsr_data_obj)                     :: left_data_rp, left_data_sp, &
                                                  right_data_rp, right_data_sp
      TYPE(dbcsr_data_obj), POINTER            :: trs_stackbuf_calc, &
                                                  trs_stackbuf_comm
      TYPE(dbcsr_data_obj), TARGET             :: trs_stackbuf_1, trs_stackbuf_2
      TYPE(dbcsr_data_obj)                     :: normsbuf, offsetsbuf, nelemsbuf
      TYPE(dbcsr_mm_multrec_type_p), DIMENSION(:), ALLOCATABLE :: multrec
      TYPE(dbcsr_mp_obj)                       :: left_mp_obj, product_mp_obj, &
                                                  right_mp_obj
      TYPE(mp_comm_type)                       :: grp, left_grp, right_grp, mp_group
      TYPE(mp_request_type), DIMENSION(:), ALLOCATABLE :: left_data_rr, left_data_sr, left_index_rr, &
                                                         left_index_sr, right_data_rr, right_data_sr, right_index_rr, right_index_sr

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      NULLIFY (trs_stackbuf_calc, trs_stackbuf_comm)
      NULLIFY (row_blk_sizes2enum, enum2row_blk_sizes)
      NULLIFY (col_blk_sizes2enum, enum2col_blk_sizes)
      NULLIFY (k_sizes)
      !
      ALLOCATE (left_buffer_2, right_buffer_2)
      mult_id = mult_id + 1

      IF (PRESENT(retain_sparsity)) THEN
         keep_sparsity = retain_sparsity
      ELSE
         keep_sparsity = .FALSE.
      END IF
      otf_filtering = PRESENT(filter_eps)

!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (multrec, nthreads, product_matrix)
!$OMP MASTER
      nthreads = 1
!$    nthreads = OMP_GET_NUM_THREADS()
      IF (.NOT. ASSOCIATED(product_matrix%wms)) &
         DBCSR_ABORT("Work matrices do not exist")
      IF (SIZE(product_matrix%wms) .NE. nthreads) &
         DBCSR_ABORT("Work matrices not correctly sized.")
      ALLOCATE (multrec(0:nthreads - 1))
!$OMP END MASTER
!$OMP END PARALLEL

      output_unit = default_output_unit
      flop_total = 0
      ! Set up variables
      data_type = dbcsr_get_data_type(product_matrix)
      data_type_byte = dbcsr_datatype_sizeof(data_type)
      left_row_nimages = left_set%image_dist%i%row_decimation
      left_row_mult = left_set%image_dist%i%row_multiplicity
      left_col_nimages = left_set%image_dist%i%col_decimation
      left_col_mult = left_set%image_dist%i%col_multiplicity
      right_row_nimages = right_set%image_dist%i%row_decimation
      right_row_mult = right_set%image_dist%i%row_multiplicity
      right_col_nimages = right_set%image_dist%i%col_decimation
      right_col_mult = right_set%image_dist%i%col_multiplicity
      left_mp_obj = dbcsr_distribution_mp(left_set%image_dist%i%main)
      right_mp_obj = dbcsr_distribution_mp(right_set%image_dist%i%main)
      product_mp_obj = dbcsr_distribution_mp(product_matrix%dist)
      numnodes = dbcsr_mp_numnodes(product_mp_obj)
      mynode = dbcsr_mp_mynode(product_mp_obj)
      left_nprows = dbcsr_mp_nprows(left_mp_obj)
      left_npcols = dbcsr_mp_npcols(left_mp_obj)
      left_myprow = dbcsr_mp_myprow(left_mp_obj)
      left_mypcol = dbcsr_mp_mypcol(left_mp_obj)
      left_myfirstvrow = dbcsr_mp_myprow(left_mp_obj)*left_row_nimages
      left_myfirstvcol = dbcsr_mp_mypcol(left_mp_obj)*left_col_nimages
      right_nprows = dbcsr_mp_nprows(right_mp_obj)
      right_npcols = dbcsr_mp_npcols(right_mp_obj)
      right_myprow = dbcsr_mp_myprow(right_mp_obj)
      right_mypcol = dbcsr_mp_mypcol(right_mp_obj)
      right_myfirstvrow = dbcsr_mp_myprow(right_mp_obj)*right_row_nimages
      right_myfirstvcol = dbcsr_mp_mypcol(right_mp_obj)*right_col_nimages
      mp_group = dbcsr_mp_group(product_mp_obj)
      left_pgrid => dbcsr_mp_pgrid(left_mp_obj)
      right_pgrid => dbcsr_mp_pgrid(right_mp_obj)
      product_pgrid => dbcsr_mp_pgrid(product_mp_obj)
      CALL dbcsr_mp_grid_setup(product_mp_obj)
      CALL dbcsr_mp_grid_setup(left_mp_obj)
      CALL dbcsr_mp_grid_setup(right_mp_obj)
      !
      ! Dummy checks
      ! left/right matching
      IF (left_col_nimages .NE. right_row_mult) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_mult .NE. right_row_nimages) &
         DBCSR_ABORT("Left/Right image mismatch")
      IF (left_col_nimages*left_npcols .NE. right_row_nimages*right_nprows) &
         DBCSR_ABORT("Left/Right total mismatch")
      ! product/left matching
      IF (left_row_mult*dbcsr_mp_nprows(product_mp_obj) .NE. left_row_nimages*left_nprows) &
         DBCSR_ABORT("Product/Left total mismatch")
      ! product/left matching
      IF (right_col_mult*dbcsr_mp_npcols(product_mp_obj) .NE. right_col_nimages*right_npcols) &
         DBCSR_ABORT("Product/Right total mismatch")
      ! Limitations
      IF (left_row_nimages .NE. 1) &
         DBCSR_ABORT("Product/Left matrix process grid mismatch")
      IF (left_row_mult .NE. 1) &
         DBCSR_ABORT("Product/Left matrix process grid mismatch")
      IF (right_col_nimages .NE. 1) &
         DBCSR_ABORT("Product/Right matrix process grid mismatch")
      IF (right_col_mult .NE. 1) &
         DBCSR_ABORT("Product/Right matrix process grid mismatch")

      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, left_row_nimages*left_col_nimages)
      dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, right_row_nimages*right_col_nimages)
      !
      ! Exchange size data
      ALLOCATE (my_sizes(4, MAX(left_row_nimages, right_row_nimages), &
                         MAX(left_col_nimages, right_col_nimages)))
      my_sizes(:, :, :) = 0
      DO left_row_image = 1, left_row_nimages
         DO left_col_image = 1, left_col_nimages
            my_sizes(idata + ileft, left_row_image, left_col_image) &
               = dbcsr_data_get_size_referenced( &
                 left_set%mats(left_row_image, left_col_image)%data_area)
            my_sizes(imeta + ileft, left_row_image, left_col_image) = &
               left_set%mats(left_row_image, left_col_image)%index &
               (dbcsr_slot_size)
         END DO
      END DO

      DO right_row_image = 1, right_row_nimages
         DO right_col_image = 1, right_col_nimages
            my_sizes(idata + iright, right_row_image, right_col_image) &
               = dbcsr_data_get_size_referenced( &
                 right_set%mats(right_row_image, right_col_image)%data_area)
            my_sizes(imeta + iright, right_row_image, right_col_image) = &
               right_set%mats(right_row_image, right_col_image)%index &
               (dbcsr_slot_size)
         END DO
      END DO

      ALLOCATE (all_sizes(4, LBOUND(my_sizes, 2):UBOUND(my_sizes, 2), &
                          LBOUND(my_sizes, 3):UBOUND(my_sizes, 3), 0:numnodes - 1))
      CALL mp_allgather(my_sizes, all_sizes, mp_group)
      !
      ! Count the maximum possible multiplies per row for on-the-fly
      ! filtering.
      per_row_eps: IF (.NOT. otf_filtering) THEN
         ! These arrays must be valid when passed to called subroutines.
         ALLOCATE (left_norms(0), right_norms(0), row_max_epss(0), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory")
      ELSE
         IF (careful_mod) THEN
            IF (left_set%mats(1, 1)%bcsc) &
               DBCSR_ABORT("Can not do on-the-fly filtering with CSC-indexed matrices.")
         END IF
         IF (dbcsr_has_local_row_index(left_set%mats(1, 1))) THEN
            nblkrows_used = dbcsr_nblkrows_local(left_set%mats(1, 1))
         ELSE
            nblkrows_used = dbcsr_nblkrows_total(left_set%mats(1, 1))
         END IF
         ALLOCATE (row_max_epss(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left epsilons")
         ALLOCATE (row_counts(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left row counts")
         ! The summation could be done prow-locally but it would
         ! complicate the pre-row eps calculation.
         ALLOCATE (total_row_counts(nblkrows_used), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left row counts")
         ! Each prow member matrix (npcols * row_images) counts the
         ! blocks present in each of its rows.
         total_row_counts(:) = 0
         DO left_row_image = 1, left_row_nimages
            DO left_col_image = 1, left_col_nimages
               list_indexing = &
                  left_set%mats(left_row_image, left_col_image)%list_indexing
               IF (careful_mod) THEN
                  IF (list_indexing) THEN
                     IF ((left_set%mats(left_row_image, left_col_image)%nblks)*3 .NE. &
                         SIZE(left_set%mats(left_row_image, left_col_image)%coo_l)) &
                        DBCSR_ABORT("Row count mismatch")
                  ELSE
                     IF (nblkrows_used + 1 .NE. SIZE(left_set%mats(left_row_image, left_col_image)%row_p)) &
                        DBCSR_ABORT("Row count mismatch")
                  END IF
               END IF
               IF (list_indexing) THEN
                  CALL count_bins( &
                     left_set%mats(left_row_image, left_col_image)%nblks, &
                     left_set%mats(left_row_image, left_col_image)%coo_l(1::3), &
                     nblkrows_used, row_counts)
               ELSE
                  CALL dbcsr_count_row_index( &
                     left_set%mats(left_row_image, left_col_image)%row_p, &
                     row_counts, nblkrows_used)
               END IF
               total_row_counts(:) = total_row_counts(:) &
                                     + row_counts(:)
            END DO
         END DO
         ! The counted blocks are then summed up
         CALL mp_sum(total_row_counts, dbcsr_mp_my_row_group(product_mp_obj))
         ! and used to determine the maximum per-block epsilon.
         filter_eps_sp = REAL(filter_eps, KIND=KIND(row_max_epss))
!$OMP PARALLEL DO DEFAULT (NONE) &
!$OMP SHARED(nblkrows_used,row_max_epss,filter_eps_sp,&
!$OMP        total_row_counts)
         DO row = 1, nblkrows_used
            row_max_epss(row) &
               = (filter_eps_sp &
                  /REAL(MAX(1, total_row_counts(row)), KIND=KIND(row_max_epss)))**2
         END DO
!$OMP END PARALLEL DO
         !
         DEALLOCATE (row_counts)
         DEALLOCATE (total_row_counts)
      END IF per_row_eps
      !
      ! The main transfer loop goes through the virtual rows/columns.
      ! The number of steps may be smaller if the grid dimension is very
      ! non-optimal (both left column images and right row images are >
      ! 1).
      min_nimages = MIN(left_col_nimages, right_row_nimages)
      nvirt_k = left_npcols*left_col_nimages
      nsteps_k = nvirt_k/min_nimages
      !
      ! Translate the all_sizes to account for pre-distribution.  This
      ! is just done to simplify lookups.
      ALLOCATE (left_sizes(2, 0:left_nprows*left_row_nimages - 1, 0:nvirt_k - 1))
      left_sizes = -1
      DO left_src_vcol = 0, left_col_nimages*left_npcols - 1
         DO left_src_vrow = 0, left_row_nimages*left_nprows - 1
            ! Calculate what was shifted.  The left_src_v{row,col} are
            ! the "source" rows/columns; the left_dst are the shifted
            ! targets where the data was placed in make_images.
            CALL image_calculator(left_set%image_dist, &
                                  prow=left_dst_prow, pcol=left_dst_pcol, &
                                  rowi=left_dst_irow, coli=left_dst_icol, &
                                  myvprow=left_src_vrow, myvpcol=left_src_vcol, &
                                  shifting='l')
            left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol)
            left_sizes(idata, left_src_vrow, left_src_vcol) = &
               all_sizes( &
               idata + ileft, left_dst_irow, left_dst_icol, left_dst_p)
            left_sizes(imeta, left_src_vrow, left_src_vcol) = &
               all_sizes( &
               imeta + ileft, left_dst_irow, left_dst_icol, left_dst_p)
         END DO
      END DO
      !
      ALLOCATE (right_sizes(2, 0:nvirt_k - 1, 0:right_npcols*right_col_nimages - 1))
      right_sizes = -1
      DO right_src_vcol = 0, right_col_nimages*right_npcols - 1
         DO right_src_vrow = 0, right_row_nimages*right_nprows - 1
            ! Calculate what was shifted.  The right_src_v{row,col} are
            ! the "source" rows/columns; the right_dst are the shifted
            ! targets where the data was placed in make_images.
            CALL image_calculator(right_set%image_dist, &
                                  prow=right_dst_prow, pcol=right_dst_pcol, &
                                  rowi=right_dst_irow, coli=right_dst_icol, &
                                  myvprow=right_src_vrow, myvpcol=right_src_vcol, &
                                  shifting='r')
            right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol)
            right_sizes(idata, right_src_vrow, right_src_vcol) = &
               all_sizes( &
               idata + iright, right_dst_irow, right_dst_icol, right_dst_p)
            right_sizes(imeta, right_src_vrow, right_src_vcol) = &
               all_sizes( &
               imeta + iright, right_dst_irow, right_dst_icol, right_dst_p)
         END DO
      END DO
      !
      ! Setup product work areas
      left_max_nze = MAXVAL(all_sizes(idata + ileft, :, :, :))
      left_max_nblks = MAXVAL(all_sizes(imeta + ileft, :, :, :))
      right_max_nze = MAXVAL(all_sizes(idata + iright, :, :, :))
      right_max_nblks = MAXVAL(all_sizes(imeta + iright, :, :, :))
      !!
      ! Evaluate sizes for workspaces
      IF (.NOT. keep_sparsity) THEN
         IF (use_acc()) THEN
            size_guess_init = product_matrix_size_guess(left_set%mats(1, 1), right_set%mats(1, 1), product_matrix, &
                                                        left_max_nze, right_max_nze, &
                                                        left_col_nimages, right_row_nimages, &
                                                        nthreads)
         ELSE
            size_guess_init = 1
         END IF
      END IF
      ithread = 0
!$OMP PARALLEL DEFAULT(NONE) &
!$OMP          PRIVATE (i, size_guess, ithread) &
!$OMP          SHARED (product_matrix, left_max_nze, right_max_nze) &
!$OMP          SHARED (left_set, right_set, &
!$OMP                 left_col_nimages, right_row_nimages) &
!$OMP          SHARED (nthreads, keep_sparsity, mynode, size_guess_init)
      !
!$    ithread = OMP_GET_THREAD_NUM()
      ! The work arrays have to be setup (actually, not quite sure).
      i = ithread + 1
      size_guess = product_matrix%wms(i)%datasize ! Should be minimal
      IF (.NOT. keep_sparsity) THEN
         size_guess = MAX(size_guess, size_guess_init)
      END IF
      CALL dbcsr_data_ensure_size(product_matrix%wms(i)%data_area, &
                                  size_guess)
      CALL dbcsr_data_set_size_referenced(product_matrix%wms(i)%data_area, &
                                          product_matrix%wms(i)%datasize)
      ! XXXXXXX a quick fix right now, allocation with size 1 might actually not be needed at all,
      !         but something expects this to be associated
      CALL ensure_array_size(product_matrix%wms(i)%row_i, ub=1)
      CALL ensure_array_size(product_matrix%wms(i)%col_i, ub=1)
      CALL ensure_array_size(product_matrix%wms(i)%blk_p, ub=1)
!$OMP END PARALLEL

      ! update capacity of memory-pools, +1 for the dense case
      IF (ASSOCIATED(memtype_abpanel_1%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_1%pool, &
                                           capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1)
      IF (ASSOCIATED(memtype_abpanel_2%pool)) &
         CALL dbcsr_mempool_limit_capacity(memtype_abpanel_2%pool, &
                                           capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1)
      IF (use_acc()) THEN
         ! enumerate the blocksizes to keep the following 2D-arrays small.
         CALL enumerate_blk_sizes(right_set%mats(1, 1)%row_blk_size%low%data, &
                                  dbcsr_max_row_size(right_set%mats(1, 1)), &
                                  row_blk_sizes2enum, enum2row_blk_sizes)
         CALL enumerate_blk_sizes(right_set%mats(1, 1)%col_blk_size%low%data, &
                                  dbcsr_max_col_size(right_set%mats(1, 1)), &
                                  col_blk_sizes2enum, enum2col_blk_sizes)
      END IF

      ! Save col and row communicators
      IF (dbcsr_mp_has_subgroups(right_mp_obj)) THEN
         right_grp = dbcsr_mp_my_col_group(right_mp_obj)
      ELSE
         right_grp = dbcsr_mp_group(right_mp_obj)
      END IF
      IF (dbcsr_mp_has_subgroups(left_mp_obj)) THEN
         left_grp = dbcsr_mp_my_row_group(left_mp_obj)
      ELSE
         left_grp = dbcsr_mp_group(left_mp_obj)
      END IF
      CALL mp_environ(left_numnodes, left_mynode, left_grp)
      CALL mp_environ(right_numnodes, right_mynode, right_grp)

      !
      ! Setup the left buffer matrices
      !
      CALL buffer_matrices_ensure_size(left_set, index_size=left_max_nblks, &
                                       data_size=left_max_nze)

      CALL setup_buffer_matrices(left_buffer_2, left_row_mult, left_col_nimages, &
                                 left_set%mats(1, 1), index_size=left_max_nblks, &
                                 data_size=left_max_nze)
      IF (otf_filtering) THEN
         ALLOCATE (left_norms(left_max_nblks), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_ABORT("Could not allocate memory for left norms")
         IF (stat .NE. 0) otf_filtering = .FALSE.
      END IF
      left_buffer_calc => left_set
      left_buffer_comm => left_buffer_2
      ALLOCATE (left_data_sr(left_col_nimages))
      ALLOCATE (left_index_sr(left_col_nimages))
      ALLOCATE (left_data_rr(left_col_nimages))
      ALLOCATE (left_index_rr(left_col_nimages))
      left_data_sr = mp_request_null
      left_data_rr = mp_request_null
      left_index_sr = mp_request_null
      left_index_rr = mp_request_null

      ! Setup buffers for right matrix
      CALL buffer_matrices_ensure_size(right_set, index_size=right_max_nblks, &
                                       data_size=right_max_nze)

      CALL setup_buffer_matrices(right_buffer_2, right_row_nimages, right_col_mult, &
                                 right_set%mats(1, 1), index_size=right_max_nblks, data_size=right_max_nze)
      IF (otf_filtering) THEN
         ALLOCATE (right_norms(right_max_nblks), stat=stat)
         IF (stat .NE. 0) &
            DBCSR_WARN("Could not allocate memory for right norms")
         IF (stat .NE. 0) otf_filtering = .FALSE.

      END IF
      IF (use_acc() .and. otf_filtering) THEN
         max_nblks = MAX(left_max_nblks, right_max_nblks)
         CALL dbcsr_data_init(normsbuf)
         CALL dbcsr_data_new(normsbuf, data_type=dbcsr_type_real_4, &
                             data_size=max_nblks, memory_type=memtype_normsbuf)
         CALL dbcsr_data_init(offsetsbuf)
         CALL dbcsr_data_new(offsetsbuf, data_type=dbcsr_type_int_4, &
                             data_size=max_nblks, memory_type=memtype_offsetsbuf)
         CALL dbcsr_data_init(nelemsbuf)
         CALL dbcsr_data_new(nelemsbuf, data_type=dbcsr_type_int_4, &
                             data_size=max_nblks, memory_type=memtype_nelemsbuf)
      END IF
      right_buffer_calc => right_set
      right_buffer_comm => right_buffer_2
      ALLOCATE (right_data_sr(right_row_nimages))
      ALLOCATE (right_index_sr(right_row_nimages))
      ALLOCATE (right_data_rr(right_row_nimages))
      ALLOCATE (right_index_rr(right_row_nimages))
      right_data_sr = mp_request_null
      right_data_rr = mp_request_null
      right_index_sr = mp_request_null
      right_index_rr = mp_request_null
      !
      ALLOCATE (m_sizes(dbcsr_nblkrows_local(product_matrix)))
      CALL local_filter(array_data(product_matrix%row_blk_size), array_size(product_matrix%local_rows), &
                        array_data(product_matrix%local_rows), m_sizes)
      ALLOCATE (n_sizes(dbcsr_nblkcols_local(product_matrix)))
      CALL local_filter(array_data(product_matrix%col_blk_size), array_size(product_matrix%local_cols), &
                        array_data(product_matrix%local_cols), n_sizes)
      !
!$OMP PARALLEL &
!$OMP DEFAULT (NONE) &
!$OMP SHARED (left_buffer_comm, right_buffer_comm, product_matrix,&
!$OMP         keep_sparsity, filter_eps, row_max_epss, multrec, nthreads, &
!$OMP         right_data_sr, right_data_rr, left_data_sr, left_data_rr,&
!$OMP         right_index_sr, right_index_rr, left_index_sr, left_index_rr,&
!$OMP         m_sizes, n_sizes, keep_product_data), &
!$OMP PRIVATE(ithread)
      ithread = 0
!$    ithread = OMP_GET_THREAD_NUM()
      ALLOCATE (multrec(ithread)%p)
      CALL dbcsr_mm_multrec_init(multrec(ithread)%p, &
                                 product=product_matrix, &
                                 keep_sparsity=keep_sparsity, &
                                 eps=filter_eps, &
                                 row_max_epss=row_max_epss, &
                                 block_estimate=MAX(product_matrix%nblks, &
                                                    left_buffer_comm%mats(1, 1)%nblks, &
                                                    right_buffer_comm%mats(1, 1)%nblks)/nthreads, &
                                 right_row_blk_size=array_data(right_buffer_comm%mats(1, 1)%row_blk_size), &
                                 m_sizes=m_sizes, n_sizes=n_sizes, &
                                 keep_product_data=keep_product_data)
!$OMP END PARALLEL
      !
      ! Setup indexing
      CALL setup_rec_index_2d(left_set, left_row_nimages, left_col_nimages)
      CALL setup_rec_index_2d(right_set, right_row_nimages, right_col_nimages)
      !
      ! Setup the send/receive data pointers
      CALL dbcsr_data_init(left_data_sp)
      CALL dbcsr_data_init(left_data_rp)
      CALL dbcsr_data_init(right_data_sp)
      CALL dbcsr_data_init(right_data_rp)
      CALL dbcsr_data_new(left_data_sp, data_type)
      CALL dbcsr_data_new(left_data_rp, data_type)
      CALL dbcsr_data_new(right_data_sp, data_type)
      CALL dbcsr_data_new(right_data_rp, data_type)

      ! Setup transpose stackbuffers
      IF (use_acc()) THEN
         CALL dbcsr_data_init(trs_stackbuf_1)
         CALL dbcsr_data_init(trs_stackbuf_2)
         CALL dbcsr_data_new(trs_stackbuf_1, data_type=dbcsr_type_int_4, &
                             data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_1)
         CALL dbcsr_data_new(trs_stackbuf_2, data_type=dbcsr_type_int_4, &
                             data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_2)
         trs_stackbuf_calc => trs_stackbuf_1
         trs_stackbuf_comm => trs_stackbuf_2
      END IF
      !
      ! Reset indices for virtual images
      v_ki_right = 0
      v_ki_left = 0
      !
      ! Here is the main loop.
      !
      ! In the first loop iteration, the data is fetched from the
      ! sources. In the remaining iterations, the data are exchanged
      ! among neighbors.  In the last loop only calculations take place.
      !
      CALL timeset(routineN//"_loop", handle1)
      copy_left = .true.
      copy_right = .true.
      !
      grouped_k_index: DO metronome = 0, nvirt_k - 1
         ! Wait for right matrix transfer completion. Wait in all but
         ! the first loop iteration.
         CALL timeset(routineN//"_metrocomm1", handle2)
         wait_right: IF (v_ki_right .EQ. right_row_nimages) THEN
            ! Reset index
            v_ki_right = 0
            IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for right"
            !
            CALL mp_waitall(right_data_sr)
            CALL mp_waitall(right_data_rr)
            CALL mp_waitall(right_index_sr)
            CALL mp_waitall(right_index_rr)
            !
            ! Repoint indices of right matrices
            DO v_ki = 0, right_row_nimages - 1
               CALL dbcsr_repoint_index(right_buffer_calc%mats(v_ki + 1, 1))
               right_buffer_calc%mats(v_ki + 1, 1)%valid = .TRUE.
            END DO
         END IF wait_right
         CALL timestop(handle2)
         !
         ! Wait for left matrix transfer completion. Wait in all but
         ! the first loop iteration.
         CALL timeset(routineN//"_metrocomm3", handle2)
         wait_left: IF (v_ki_left .EQ. left_col_nimages) THEN
            ! Reset index
            v_ki_left = 0
            IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for left"
            CALL mp_waitall(left_data_sr)
            CALL mp_waitall(left_data_rr)
            CALL mp_waitall(left_index_sr)
            CALL mp_waitall(left_index_rr)
            !
            ! Repoint indices of left matrices
            DO v_ki = 0, left_col_nimages - 1
               CALL dbcsr_repoint_index(left_buffer_calc%mats(1, v_ki + 1))
               left_buffer_calc%mats(1, v_ki + 1)%valid = .TRUE.
            END DO
         END IF wait_left
         CALL timestop(handle2)

         v_ki_left = v_ki_left + 1
         v_ki_right = v_ki_right + 1

         IF (debug_mod) THEN
            CALL dbcsr_print(left_buffer_calc%mats(1, v_ki_left), nodata=.TRUE.)
            CALL dbcsr_print(right_buffer_calc%mats(v_ki_right, 1), nodata=.TRUE.)
         END IF
         !
         ! from here the code for dbcsr_mm_driver_inner_init was taken
         !
         IF (.FALSE.) WRITE (*, *) routineN//" TICK", metronome
         ! Since the right matrix is shifted vertically, the
         ! received data always has different notions of "local
         ! rows".  Thus the local_rows and global_rows must be
         ! recalculated.
         CALL dbcsr_reset_vlocals(right_buffer_calc%mats(v_ki_right, 1), &
                                  right_set%image_dist)
         CALL dbcsr_reset_vlocals(left_buffer_calc%mats(1, v_ki_left), &
                                  left_set%image_dist)
         !
         CALL ensure_array_size(k_sizes, ub=array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows))
         CALL local_filter(array_data(right_buffer_calc%mats(v_ki_right, 1)%row_blk_size), &
                           array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           array_data(right_buffer_calc%mats(v_ki_right, 1)%local_rows), &
                           k_sizes)
         !
         ! Transfer left and right buffers from host to GPU memory
         IF (use_acc()) THEN
            IF (copy_left) THEN
               ! copy left buffer images to device
               DO v_ki = 1, left_col_nimages
                  CALL dbcsr_data_host2dev(left_buffer_calc%mats(1, v_ki)%data_area)
                  CALL timeset(routineN//"_sync_h2d", handle2)
                  CALL acc_stream_synchronize(left_buffer_calc%mats(1, v_ki)%data_area%d%memory_type%acc_stream)
                  CALL timestop(handle2)
               END DO
               copy_left = .false.
            END IF
            ! calculate norms for matrices in left buffer
            IF (otf_filtering) THEN
               left_norms(:) = huge_norm
               CALL acc_calculate_norms(left_buffer_calc%mats(1, v_ki_left), &
                                        left_norms, normsbuf, offsetsbuf, nelemsbuf, m_sizes, k_sizes)
            END IF

            IF (copy_right) THEN
               ! copy right buffer images to device
               DO v_ki = 1, right_row_nimages
                  CALL dbcsr_data_host2dev(right_buffer_calc%mats(v_ki, 1)%data_area)
                  CALL timeset(routineN//"_sync_h2d", handle2)
                  CALL acc_stream_synchronize(right_buffer_calc%mats(v_ki, 1)%data_area%d%memory_type%acc_stream)
                  CALL timestop(handle2)

                  ! now transpose right buffer image
                  CALL acc_transpose_blocks(right_buffer_calc%mats(v_ki, 1), trs_stackbuf_calc, &
                                            k_sizes, n_sizes, &
                                            row_blk_sizes2enum, enum2row_blk_sizes, &
                                            col_blk_sizes2enum, enum2col_blk_sizes)
               END DO
               ! Wait for transpose to complete before proceeding
               CALL timeset(routineN//"_sync_h2d", handle2)
               CALL acc_stream_synchronize(trs_stackbuf_calc%d%memory_type%acc_stream)
               CALL timestop(handle2)
               copy_right = .false.
            END IF
            ! calculate norms for matrices in right buffer
            IF (otf_filtering) THEN
               right_norms(:) = huge_norm
               CALL acc_calculate_norms(right_buffer_calc%mats(v_ki_right, 1), &
                                        right_norms, normsbuf, offsetsbuf, nelemsbuf, k_sizes, n_sizes)
            END IF
         END IF
         !
         ! Right matrix transfer. Transfer in all but the last loop
         ! iteration.
         xfer_right: IF (v_ki_right .EQ. 1 .AND. metronome + right_row_nimages .LT. nvirt_k) THEN
            DO v_ki = 0, right_row_nimages - 1
               ! Calculate the process to send to.  It's the virtual
               ! process row -min_nimages up (i.e., smaller row number)
               ! from me.
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_send_prow, rowi=right_send_irow, & ! output
                                     pcol=right_send_pcol, coli=right_send_icol, & ! output
                                     vprow=right_send_vrow, vpcol=right_send_vcol, & ! output
                                     ! myvprow goes through all of my (process row) images
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, & ! nothing happens in the columns
                                     vprow_shift=-right_row_nimages, &
                                     shifting='0')
               ! Calculate which data I send.
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_dst_prow, rowi=right_dst_irow, &
                                     pcol=right_dst_pcol, coli=right_dst_icol, &
                                     vprow=right_dst_vrow, vpcol=right_dst_vcol, &
                                     ! myvprows goes through all of my (process row) images
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, & ! nothing happens in the columns
                                     vprow_shift=metronome, &
                                     ! This is with relative shifting.
                                     shifting='R')
               right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=right_data_sp, &
                  rsize=right_sizes(idata, right_dst_vrow, right_dst_vcol), &
                  csize=1, &
                  pointee=right_buffer_calc%mats(v_ki + 1, 1)%data_area)
               right_index_sp => right_buffer_calc%mats( &
                                 v_ki + 1, 1 &
                                 )%index(1: &
                                         right_sizes(imeta, right_dst_vrow, right_dst_vcol))
               !
               ! Calculate the process to receive from
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_recv_prow, rowi=right_recv_irow, &
                                     pcol=right_recv_pcol, coli=right_recv_icol, &
                                     vprow=right_recv_vrow, vpcol=right_recv_vcol, &
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, &
                                     vprow_shift=+right_row_nimages, & ! just the opposite as "send to"
                                     shifting='0')
               ! Calculate which data I receive
               CALL image_calculator(right_set%image_dist, &
                                     prow=right_src_prow, rowi=right_src_irow, &
                                     pcol=right_src_pcol, coli=right_src_icol, &
                                     vprow=right_src_vrow, vpcol=right_src_vcol, &
                                     myvprow=v_ki + right_myfirstvrow, &
                                     myvpcol=right_myfirstvcol, &
                                     ! receive window moves with the metronome
                                     vprow_shift=metronome + right_row_nimages, &
                                     shifting='R')
               !
               IF (use_acc()) THEN
                  CALL timeset(routineN//"_acc_sync_right", handle3)
                  CALL acc_event_synchronize(right_buffer_comm%mats(v_ki + 1, 1)%data_area%d%acc_ready)
                  CALL timestop(handle3)
               END IF

               right_src_p = right_pgrid(right_src_prow, right_src_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=right_data_rp, &
                  rsize=right_sizes(idata, right_src_vrow, right_src_vcol), &
                  csize=1, &
                  pointee=right_buffer_comm%mats(v_ki + 1, 1)%data_area)
               right_index_rp => right_buffer_comm%mats( &
                                 v_ki + 1, 1 &
                                 )%index(1: &
                                         right_sizes(imeta, right_src_vrow, right_src_vcol))
               !
               right_send_p = right_pgrid(right_send_prow, right_send_pcol)
               right_recv_p = right_pgrid(right_recv_prow, right_recv_pcol)
               ! These are column-communicator relative
               IF (dbcsr_mp_has_subgroups(right_mp_obj)) THEN
                  right_send_p = right_send_prow
                  right_recv_p = right_recv_prow
                  grp = dbcsr_mp_my_col_group(right_mp_obj)
               ELSE
                  grp = dbcsr_mp_group(right_mp_obj)
               END IF
               !
               CALL timeset(routineN//"_metrocomm2", handle2)
               IF (.not. use_acc()) THEN
                  CALL dbcsr_irecv_any(right_data_rp, right_recv_p, &
                                       grp, right_data_rr(v_ki + 1), tag=right_src_vrow)
               ELSE
                  msglen = right_sizes(idata, right_src_vrow, right_src_vcol)
#if defined (__DBCSR_ACC)
                  CALL C_F_POINTER(acc_devmem_cptr(right_buffer_comm%mats( &
                                                   v_ki + 1, 1)%data_area%d%acc_devmem), &
                                   right_data_rp%d%r_dp, (/msglen/))
#endif
                  CALL mp_irecv(right_data_rp%d%r_dp, &
                                right_recv_p, grp, &
                                right_data_rr(v_ki + 1), tag=right_src_vrow)
               END IF
               CALL mp_irecv(right_index_rp, right_recv_p, &
                             grp, right_index_rr(v_ki + 1), tag=right_src_vrow)
               IF (.not. use_acc()) THEN
                  CALL dbcsr_isend_any(right_data_sp, right_send_p, &
                                       grp, right_data_sr(v_ki + 1), tag=right_dst_vrow)
               ELSE
                  msglen = right_sizes(idata, right_dst_vrow, right_dst_vcol)
#if defined (__DBCSR_ACC)
                  CALL C_F_POINTER(acc_devmem_cptr(right_buffer_calc%mats( &
                                                   v_ki + 1, 1)%data_area%d%acc_devmem), &
                                   right_data_sp%d%r_dp, (/msglen/))
#endif
                  CALL mp_isend(right_data_sp%d%r_dp, &
                                right_send_p, grp, &
                                right_data_sr(v_ki + 1), tag=right_dst_vrow)
               END IF
               CALL mp_isend(right_index_sp, right_send_p, &
                             grp, right_index_sr(v_ki + 1), tag=right_dst_vrow)
               dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
               CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), &
                                         dbcsr_data_get_size(right_data_rp), &
                                         data_type_byte, &
                                         dbcsr_mpi_statistics%data_size_breakdown(:, :, 1))
               CALL timestop(handle2)
            END DO
         END IF xfer_right
         !
         ! Left matrix transfer. Transfer in all but the last processor images.
         xfer_left: IF (v_ki_left .EQ. 1 .AND. metronome + left_col_nimages .LT. nvirt_k) THEN
            DO v_ki = 0, left_col_nimages - 1
               ! Calculate the process to send to.
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_send_prow, rowi=left_send_irow, & ! output
                                     pcol=left_send_pcol, coli=left_send_icol, & ! output
                                     vprow=left_send_vrow, vpcol=left_send_vcol, & ! output
                                     myvprow=left_myfirstvrow, & ! nothing happens in the rows
                                     ! go through all my column images
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     ! send to process left_col_nimages left in the grid
                                     vpcol_shift=-left_col_nimages, &
                                     shifting='0')
               ! Calculate which data I send.
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_dst_prow, rowi=left_dst_irow, &
                                     pcol=left_dst_pcol, coli=left_dst_icol, &
                                     vprow=left_dst_vrow, vpcol=left_dst_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     ! go through all my column images
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     vpcol_shift=metronome, &
                                     ! This is with relative shifting.
                                     shifting='L')
               !
               left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=left_data_sp, &
                  rsize=left_sizes(idata, left_dst_vrow, left_dst_vcol), &
                  csize=1, &
                  pointee=left_buffer_calc%mats(1, v_ki + 1)%data_area)
               left_index_sp => left_buffer_calc%mats( &
                                1, v_ki + 1 &
                                )%index(1: &
                                        left_sizes(imeta, left_dst_vrow, left_dst_vcol))
               !
               ! Calculate the process to receive from
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_recv_prow, rowi=left_recv_irow, &
                                     pcol=left_recv_pcol, coli=left_recv_icol, &
                                     vprow=left_recv_vrow, vpcol=left_recv_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     vpcol_shift=+left_col_nimages, & ! just the opposite as "send to"
                                     shifting='0')
               ! Calculate which data I receive
               CALL image_calculator(left_set%image_dist, &
                                     prow=left_src_prow, rowi=left_src_irow, &
                                     pcol=left_src_pcol, coli=left_src_icol, &
                                     vprow=left_src_vrow, vpcol=left_src_vcol, &
                                     myvprow=left_myfirstvrow, &
                                     myvpcol=v_ki + left_myfirstvcol, &
                                     ! receive window moves with the metronome
                                     vpcol_shift=metronome + left_col_nimages, &
                                     shifting='L')
               !
               IF (use_acc()) THEN
                  CALL timeset(routineN//"_acc_sync_left", handle3)
                  CALL acc_event_synchronize(left_buffer_comm%mats(1, v_ki + 1)%data_area%d%acc_ready)
                  CALL timestop(handle3)
               END IF

               left_src_p = left_pgrid(left_src_prow, left_src_pcol)
               CALL dbcsr_data_set_pointer( &
                  area=left_data_rp, &
                  rsize=left_sizes(idata, left_src_vrow, left_src_vcol), &
                  csize=1, &
                  pointee=left_buffer_comm%mats(1, v_ki + 1)%data_area)
               left_index_rp => left_buffer_comm%mats( &
                                1, v_ki + 1 &
                                )%index(1: &
                                        left_sizes(imeta, left_src_vrow, left_src_vcol))
               !
               left_send_p = left_pgrid(left_send_prow, left_send_pcol)
               left_recv_p = left_pgrid(left_recv_prow, left_recv_pcol)
               ! These are column-communicator relative
               IF (dbcsr_mp_has_subgroups(left_mp_obj)) THEN
                  left_send_p = left_send_pcol
                  left_recv_p = left_recv_pcol
                  grp = dbcsr_mp_my_row_group(left_mp_obj)
               ELSE
                  grp = dbcsr_mp_group(left_mp_obj)
               END IF
               !
               CALL timeset(routineN//"_metrocomm4", handle2)
               IF (.not. use_acc()) THEN
                  CALL dbcsr_irecv_any(left_data_rp, left_recv_p, &
                                       grp, left_data_rr(v_ki + 1), tag=left_src_vcol)
               ELSE
                  msglen = left_sizes(idata, left_src_vrow, left_src_vcol)
#if defined (__DBCSR_ACC)
                  CALL C_F_POINTER(acc_devmem_cptr(left_buffer_comm%mats( &
                                                   1, v_ki + 1)%data_area%d%acc_devmem), &
                                   left_data_rp%d%r_dp, (/msglen/))
#endif
                  CALL mp_irecv(left_data_rp%d%r_dp, &
                                left_recv_p, grp, &
                                left_data_rr(v_ki + 1), tag=left_src_vcol)
               END IF
               CALL mp_irecv(left_index_rp, left_recv_p, &
                             grp, left_index_rr(v_ki + 1), tag=left_src_vcol)
               IF (.not. use_acc()) THEN
                  CALL dbcsr_isend_any(left_data_sp, left_send_p, &
                                       grp, left_data_sr(v_ki + 1), tag=left_dst_vcol)
               ELSE
                  msglen = left_sizes(idata, left_dst_vrow, left_dst_vcol)
#if defined (__DBCSR_ACC)
                  CALL C_F_POINTER(acc_devmem_cptr(left_buffer_calc%mats( &
                                                   1, v_ki + 1)%data_area%d%acc_devmem), &
                                   left_data_sp%d%r_dp, (/msglen/))
#endif
                  CALL mp_isend(left_data_sp%d%r_dp, &
                                left_send_p, grp, &
                                left_data_sr(v_ki + 1), tag=left_dst_vcol)
               END IF
               CALL mp_isend(left_index_sp, left_send_p, &
                             grp, left_index_sr(v_ki + 1), tag=left_dst_vcol)
               dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1
               CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(2, :), &
                                         dbcsr_data_get_size(left_data_rp), &
                                         data_type_byte, &
                                         dbcsr_mpi_statistics%data_size_breakdown(:, :, 2))
               CALL timestop(handle2)
            END DO
         END IF xfer_left

         ! Do multiplication

         ! If no GPU backend, calculate norms on the CPU
         IF (otf_filtering .and. .not. use_acc()) THEN
            left_norms(:) = huge_norm
            right_norms(:) = huge_norm
            CALL calculate_norms(right_buffer_calc%mats(v_ki_right, 1), &
                                 right_norms, k_sizes, n_sizes)
            CALL calculate_norms(left_buffer_calc%mats(1, v_ki_left), &
                                 left_norms, m_sizes, k_sizes)
         END IF
         !
         flop_single = 0
         threads_finished = 0

!$OMP PARALLEL DEFAULT (NONE) &
!$OMP SHARED (left_buffer_calc, right_buffer_calc, &
!$OMP         v_ki_left, v_ki_right, handle2, handle3, &
!$OMP         product_matrix, multrec,&
!$OMP         filter_eps, right_norms, left_norms, row_max_epss, &
!$OMP         keep_sparsity,threads_finished, &
!$OMP         right_data_sr, right_data_rr, right_index_sr, right_index_rr, &
!$OMP         left_data_sr, left_data_rr, left_index_sr, left_index_rr, &
!$OMP         dbcsr_cfg, k_sizes, nvirt_k, metronome) &
!$OMP PRIVATE (ithread,nthreads,threads_finished_read) &
!$OMP REDUCTION (+: flop_single)
         ithread = 0; nthreads = 1
!$       ithread = omp_get_thread_num(); nthreads = omp_get_num_threads()

         CALL timeset(routineN//"_multrec", handle2)

         CALL dbcsr_mm_multrec_multiply(multrec(ithread)%p, &
                                        left=left_buffer_calc%mats(1, v_ki_left), &
                                        right=right_buffer_calc%mats(v_ki_right, 1), &
                                        flop=flop_single, &
                                        a_norms=left_norms, b_norms=right_norms, &
                                        k_sizes=k_sizes)

         IF (metronome == nvirt_k - 1) THEN
            CALL timeset(routineN//"_multrec_finalize", handle3)
            CALL dbcsr_mm_multrec_finalize(multrec(ithread)%p)
            DEALLOCATE (multrec(ithread)%p)
            CALL timestop(handle3)
         END IF

!$OMP ATOMIC
         threads_finished = threads_finished + 1
         IF (dbcsr_cfg%use_comm_thread%val .AND. (ithread .EQ. 0)) THEN
            DO
! requires OMP 3.1 (e.g. gcc >=4.7), for correctness, otherwise we keep fingers crossed
#if defined _OPENMP && _OPENMP >= 200711
!$OMP                 ATOMIC READ
#endif
               threads_finished_read = threads_finished
               IF (threads_finished_read .EQ. nthreads) EXIT
               ! Using MPI_Testany to trigger forward progress in MPI
               CALL mp_testany(right_data_sr)
               CALL mp_testany(right_data_rr)
               CALL mp_testany(left_data_sr)
               CALL mp_testany(left_data_rr)
               CALL mp_testany(right_index_sr)
               CALL mp_testany(right_index_rr)
               CALL mp_testany(left_index_sr)
               CALL mp_testany(left_index_rr)
            END DO
         END IF
!$OMP BARRIER
         CALL timestop(handle2)

!$OMP END PARALLEL
         flop_total = flop_total + flop_single
         !
         ! Move to the next images
         IF (v_ki_left .EQ. left_col_nimages) THEN
            CALL dbcsr_switch(left_buffer_calc, left_buffer_comm)
         END IF
         IF (v_ki_right .EQ. right_row_nimages) THEN
            CALL dbcsr_switch(right_buffer_calc, right_buffer_comm)
            CALL dbcsr_switch(trs_stackbuf_calc, trs_stackbuf_comm)
         END IF

      END DO grouped_k_index
      CALL timestop(handle1)
      CALL m_memory(mem)
      max_memory = MAX(max_memory, REAL(mem))

      IF (use_acc()) THEN
         CALL dbcsr_data_release(trs_stackbuf_1)
         CALL dbcsr_data_release(trs_stackbuf_2)
         DEALLOCATE (row_blk_sizes2enum, enum2row_blk_sizes)
         DEALLOCATE (col_blk_sizes2enum, enum2col_blk_sizes)
         IF (otf_filtering) THEN
            CALL dbcsr_data_release(normsbuf)
            CALL dbcsr_data_release(offsetsbuf)
            CALL dbcsr_data_release(nelemsbuf)
         END IF
      END IF

      IF (ALLOCATED(right_norms)) THEN
         DEALLOCATE (right_norms)
      END IF
      IF (ALLOCATED(left_norms)) THEN
         DEALLOCATE (left_norms)
      END IF
      IF (ALLOCATED(row_max_epss)) THEN
         DEALLOCATE (row_max_epss)
      END IF
      !
      CALL dbcsr_destroy_array(right_buffer_2)
      CALL dbcsr_destroy_array(left_buffer_2)
      DEALLOCATE (my_sizes)
      !
      CALL dbcsr_data_clear_pointer(left_data_sp)
      CALL dbcsr_data_clear_pointer(left_data_rp)
      CALL dbcsr_data_clear_pointer(right_data_sp)
      CALL dbcsr_data_clear_pointer(right_data_rp)
      CALL dbcsr_data_release(left_data_sp)
      CALL dbcsr_data_release(left_data_rp)
      CALL dbcsr_data_release(right_data_sp)
      CALL dbcsr_data_release(right_data_rp)
      !
      DEALLOCATE (left_data_rr, left_data_sr, left_index_rr, left_index_sr, &
                  right_data_rr, right_data_sr, right_index_rr, right_index_sr)
      !
      !
      IF (debug_mod) THEN
         v_ki = 0
         DO i = 1, SIZE(product_matrix%blk_p)
            v_ki = MAX(v_ki, ABS(product_matrix%blk_p(i)))
         END DO
         WRITE (*, *) routineN//" Actual final size", &
            LOG(REAL(dbcsr_data_get_size(product_matrix%data_area)))/LOG(10.0), &
            LOG(REAL(v_ki))/LOG(10.0)
      END IF
      !
      flop = flop_total
      DEALLOCATE (left_buffer_2, right_buffer_2)
      DEALLOCATE (m_sizes, n_sizes)
      IF (ASSOCIATED(k_sizes)) DEALLOCATE (k_sizes)
      !
      CALL timestop(handle)
   END SUBROUTINE multiply_cannon_g2g

   SUBROUTINE setup_buffer_matrices(buffer_set, buff_nrows, buff_ncols, &
                                    source_matrix, index_size, data_size)
      TYPE(dbcsr_2d_array_type), INTENT(OUT)             :: buffer_set
      INTEGER, INTENT(IN)                                :: buff_nrows, buff_ncols
      TYPE(dbcsr_type), INTENT(IN)                       :: source_matrix
      INTEGER, INTENT(IN)                                :: index_size
      INTEGER, INTENT(IN), OPTIONAL                      :: data_size

      CHARACTER(len=*), PARAMETER :: routineN = 'setup_buffer_matrices'

      INTEGER                                            :: col_image, handle, row_image

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      CALL dbcsr_image_dist_init(buffer_set%image_dist)
      ALLOCATE (buffer_set%mats(buff_nrows, buff_ncols))
      DO row_image = 1, buff_nrows
         DO col_image = 1, buff_ncols
            CALL setup_buffer_matrix(buffer_set%mats(row_image, col_image), &
                                     source_matrix, index_size, data_size=data_size, &
                                     data_memory_type=memtype_abpanel_2)
         END DO
      END DO
      IF (source_matrix%local_indexing .AND. careful_mod) THEN
         IF (.NOT. array_exists(source_matrix%local_rows)) &
            DBCSR_ABORT("Local rows should exist.")
         IF (.NOT. array_exists(source_matrix%global_rows)) &
            DBCSR_ABORT("Global rows should exist.")
         !
         IF (.NOT. array_exists(source_matrix%local_cols)) &
            DBCSR_ABORT("Local cols should exist.")
         IF (.NOT. array_exists(source_matrix%global_cols)) &
            DBCSR_ABORT("Global cols should exist.")
      END IF
      CALL timestop(handle)
   END SUBROUTINE setup_buffer_matrices

   SUBROUTINE buffer_matrices_ensure_size(buffer_set, index_size, data_size)
      !! Enlarge left_set and right_set to hold any a/b-panel.
      !! left_set and right_set are created by make_images to hold the a/b-panels
      !! used for the initial cannon-tick. This routine ensures that these buffers
      !! can hold any a/b-panel occurring during a matrix multiply and makes them
      !! therefore suitable as buffers for the entire cannon algorithm.
      !! This saves memory since no separate buffers for the first cannon-tick
      !! have to be allocated.

      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: buffer_set
      INTEGER, INTENT(IN)                                :: index_size, data_size

      CHARACTER(len=*), PARAMETER :: routineN = 'buffer_matrices_ensure_size'

      INTEGER                                            :: col_image, handle, row_image

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)

      DO row_image = 1, SIZE(buffer_set%mats, 1)
         DO col_image = 1, SIZE(buffer_set%mats, 2)
            CALL dbcsr_data_ensure_size(buffer_set%mats(row_image, col_image)%data_area, &
                                        data_size)
            CALL ensure_array_size(buffer_set%mats(row_image, col_image)%index, &
                                   ub=index_size, &
                                   memory_type=dbcsr_get_index_memory_type(buffer_set%mats(row_image, col_image)))
            CALL dbcsr_repoint_index(buffer_set%mats(row_image, col_image))
         END DO
      END DO
      CALL timestop(handle)
   END SUBROUTINE buffer_matrices_ensure_size

   SUBROUTINE setup_rec_index_2d(matrix_set, n_rows, n_cols)
      TYPE(dbcsr_2d_array_type), INTENT(INOUT)           :: matrix_set
      INTEGER, INTENT(IN)                                :: n_rows, n_cols

      CHARACTER(len=*), PARAMETER :: routineN = 'setup_rec_index_2d'
      LOGICAL, PARAMETER                                 :: dbg = .FALSE.

      INTEGER                                            :: handle, i_col, i_row, t_f, t_l, t_size

!$    INTEGER                                  :: ithread
      LOGICAL                                  :: thread_redist

!   ---------------------------------------------------------------------------

      CALL timeset(routineN, handle)
      DO i_row = 1, n_rows
         DO i_col = 1, n_cols
            IF (.FALSE.) &
               CALL dbcsr_reset_vlocals(matrix_set%mats(i_row, i_col), &
                                        matrix_set%image_dist)
            IF (dbg) THEN
               WRITE (*, *) routineN//" m, n, size", &
                  SIZE(matrix_set%mats(i_row, i_col)%coo_l), &
                  dbcsr_nblkrows_local(matrix_set%mats(i_row, i_col)), &
                  dbcsr_nblkcols_local(matrix_set%mats(i_row, i_col))
               WRITE (*, '(3(1X,I7))') matrix_set%mats(i_row, i_col)%coo_l
            END IF
            IF (careful_mod) THEN
               IF (SIZE(matrix_set%mats(i_row, i_col)%coo_l) .NE. matrix_set%mats(i_row, i_col)%nblks*3) &
                  DBCSR_ABORT("Block count mismatch.")
            END IF
            thread_redist = ASSOCIATED(matrix_set%mats(i_row, i_col)%thr_c)
            t_size = SIZE(matrix_set%mats(i_row, i_col)%coo_l)/3
            t_f = 1
            t_l = t_size
!$OMP       PARALLEL IF (thread_redist) DEFAULT (NONE) &
!$OMP       PRIVATE (ithread) &
!$OMP       FIRSTPRIVATE (t_f, t_l, t_size) &
!$OMP       SHARED (matrix_set, i_row, i_col, thread_redist)
!$          ithread = OMP_GET_THREAD_NUM()
!$          IF (thread_redist) THEN
!$             t_f = matrix_set%mats(i_row, i_col)%thr_c(ithread + 1) + 1
!$             t_l = matrix_set%mats(i_row, i_col)%thr_c(ithread + 2)
!$          END IF
            t_size = t_l - t_f + 1
!$OMP       BARRIER
            IF (t_size .GT. 0) THEN
               CALL call_rec_sort_index( &
                  dbcsr_nblkrows_local(matrix_set%mats(i_row, i_col)), &
                  dbcsr_nblkcols_local(matrix_set%mats(i_row, i_col)), &
                  t_size, &
                  matrix_set%mats(i_row, i_col)%coo_l((t_f*3 - 2):(t_l*3)))
            END IF
!$OMP       END PARALLEL
         END DO
      END DO
      CALL timestop(handle)
   END SUBROUTINE setup_rec_index_2d

   SUBROUTINE call_rec_sort_index(m, n, nblks, idx)
      !! Used to thunk a call to rec_sort_index
      INTEGER, INTENT(IN)                                :: m, n, nblks
      INTEGER, DIMENSION(3, 1:nblks), INTENT(INOUT)      :: idx

!   ---------------------------------------------------------------------------

      CALL rec_sort_index(1, m, 1, n, nblks, idx, 0)
   END SUBROUTINE call_rec_sort_index

   SUBROUTINE dbcsr_switch_sets(set1p, set2p)
      !! Switches pointers between two matrix sets
      TYPE(dbcsr_2d_array_type), POINTER                 :: set1p, set2p

      TYPE(dbcsr_2d_array_type), POINTER                 :: tmp_set

!   ---------------------------------------------------------------------------

      tmp_set => set1p
      set1p => set2p
      set2p => tmp_set
   END SUBROUTINE dbcsr_switch_sets

   SUBROUTINE dbcsr_switch_d_ptrs(area1p, area2p)
      !! Switches pointers between two data areas
      TYPE(dbcsr_data_obj), POINTER                      :: area1p, area2p

      TYPE(dbcsr_data_obj), POINTER                      :: tmp_p

      tmp_p => area1p
      area1p => area2p
      area2p => tmp_p
   END SUBROUTINE dbcsr_switch_d_ptrs

   #:include '../data/dbcsr.fypp'
   #:for n, nametype1, base1, prec1, kind1, type1, dkind1, normname1 in inst_params_float

! **************************************************************************************************
      SUBROUTINE prepare_buffers_${nametype1}$ (negate_real, negate_imaginary, &
                                                iter, row, col, blk, blk_p, bp, &
                                                row_size, col_size, nze, nsymmetries, symmetry_i, &
                                                stored_row, stored_col, tr_row_size, tr_col_size, tr, &
                                                row_img, col_img, nrow_images, ncol_images, &
                                                row_img_dist, col_img_dist, predist_type_fwd, blacs2mpi, &
                                                target_imgdist, prow, pcol, rowi, coli, &
                                                row_dist, col_dist, dst_p, sm_pos, myt_smp, metalen, &
                                                sd_pos, myt_sdp, send_meta, sd_disp, &
                                                data_area, send_data_area, scale_neg_one, scale_value)
     !! Prepare buffers for multiplications
         LOGICAL, INTENT(IN)                                     :: negate_real, negate_imaginary
         TYPE(dbcsr_iterator), INTENT(INOUT)                     :: iter
         INTEGER, INTENT(INOUT)                                  :: row, col, blk, blk_p, row_size, col_size, &
                                                                    nze, bp, symmetry_i, &
                                                                    stored_row, stored_col, tr_row_size, tr_col_size, &
                                                                    row_img, col_img, prow, pcol, rowi, coli, &
                                                                    dst_p, sm_pos, sd_pos
         INTEGER, INTENT(IN)                                     :: nsymmetries, nrow_images, ncol_images, metalen
         LOGICAL, INTENT(INOUT)                                  :: tr
         INTEGER, DIMENSION(:), INTENT(IN), CONTIGUOUS, POINTER  :: row_img_dist, col_img_dist, row_dist, col_dist
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(IN)          :: sd_disp
         INTEGER, DIMENSION(:), ALLOCATABLE, INTENT(INOUT)       :: myt_smp, myt_sdp, send_meta
         TYPE(dbcsr_imagedistribution_obj), INTENT(IN)           :: target_imgdist
         INTEGER, DIMENSION(:, :), INTENT(IN), CONTIGUOUS, POINTER :: blacs2mpi
         CHARACTER, INTENT(IN)                                   :: predist_type_fwd
         ${type1}$, DIMENSION(:), INTENT(IN), CONTIGUOUS         :: data_area
         TYPE(dbcsr_data_obj), INTENT(INOUT)                     :: send_data_area
         TYPE(dbcsr_scalar_type), INTENT(IN)                     :: scale_neg_one
         TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL           :: scale_value

         DO WHILE (dbcsr_iterator_blocks_left(iter))
            CALL dbcsr_iterator_next_block(iter, row, col, blk, blk_p=blk_p, &
                                           row_size=row_size, col_size=col_size)
            nze = row_size*col_size
            IF (nze .EQ. 0) CYCLE
            bp = ABS(blk_p)
            DO symmetry_i = 1, nsymmetries
               IF (symmetry_i .EQ. 1) THEN
                  stored_row = row; stored_col = col; tr = blk_p .LT. 0
                  tr_row_size = col_size; tr_col_size = row_size
               ELSE
                  IF (row .EQ. col) CYCLE
                  stored_row = col; stored_col = row; tr = blk_p .GT. 0
                  tr_row_size = row_size; tr_col_size = col_size
               END IF
               ! Where do we send this block?
               row_img = 1
               IF (nrow_images .GT. 1) row_img = row_img_dist(stored_row)
               col_img = 1
               IF (ncol_images .GT. 1) col_img = col_img_dist(stored_col)
               CALL image_calculator(target_imgdist, &
                                     prow=prow, rowi=rowi, &
                                     pcol=pcol, coli=coli, &
                                     myprow=row_dist(stored_row), myrowi=row_img, &
                                     mypcol=col_dist(stored_col), mycoli=col_img, &
                                     shifting=predist_type_fwd)
               dst_p = blacs2mpi(prow, pcol)
               sm_pos = myt_smp(dst_p)
               myt_smp(dst_p) = myt_smp(dst_p) + metalen
               sd_pos = myt_sdp(dst_p)
               myt_sdp(dst_p) = myt_sdp(dst_p) + nze
               IF (tr) THEN
                  IF (PRESENT(scale_value)) THEN
                     CALL dbcsr_block_transpose(send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1)*scale_value%${base1}$_${prec1}$, &
                                                tr_row_size, tr_col_size)
                  ELSE
                     CALL dbcsr_block_transpose(send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1), &
                                                data_area(bp:bp + nze - 1), &
                                                tr_row_size, tr_col_size)
                  END IF
                  IF (negate_real .AND. negate_imaginary) THEN
                     send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1) = &
                        send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1)*scale_neg_one%${base1}$_${prec1}$
                  ELSEIF (negate_real) THEN
                     #:if nametype1=="s" or nametype1=="d"
                        send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1) = &
                           -send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1)
                     #:else
                        send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1) = &
                           CMPLX( &
                           -REAL(send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1), KIND=${kind1}$), &
                           AIMAG(send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1)), &
                           KIND=${kind1}$)
                     #:endif
                  ELSEIF (negate_imaginary) THEN
                     #:if nametype1=="c" or nametype1=="z"
                        send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1) = &
                           CONJG(send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1))
                     #:endif
                  END IF
               ELSE
                  ! Copy the block
                  IF (PRESENT(scale_value)) THEN
                     send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1) = &
                        data_area(bp:bp + nze - 1)*scale_value%${base1}$_${prec1}$
                  ELSE
                     send_data_area%d%${base1}$_${prec1}$ (sd_pos:myt_sdp(dst_p) - 1) = data_area(bp:bp + nze - 1)
                  END IF
               END IF

               send_meta(sm_pos) = stored_row
               send_meta(sm_pos + 1) = stored_col
               send_meta(sm_pos + 2) = sd_pos - sd_disp(dst_p) + 1
               send_meta(sm_pos + 3) = rowi
               send_meta(sm_pos + 4) = coli
            END DO
         END DO
      END SUBROUTINE prepare_buffers_${nametype1}$
   #:endfor

END MODULE dbcsr_mm_cannon
